wgit 0.0.18 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -5,28 +5,28 @@ require_relative 'database/database'
5
5
 
6
6
  module Wgit
7
7
  # Convenience method to index the World Wide Web using
8
- # Wgit::Indexer#index_the_web.
8
+ # Wgit::Indexer#index_www.
9
9
  #
10
10
  # Retrieves uncrawled url's from the database and recursively crawls each
11
11
  # site storing their internal pages into the database and adding their
12
12
  # external url's to be crawled later on. Logs info on the crawl
13
13
  # using Wgit.logger as it goes along.
14
14
  #
15
- # @param max_sites_to_crawl [Integer] The number of separate and whole
15
+ # @param max_sites [Integer] The number of separate and whole
16
16
  # websites to be crawled before the method exits. Defaults to -1 which
17
17
  # means the crawl will occur until manually stopped (Ctrl+C etc).
18
- # @param max_data_size [Integer] The maximum amount of bytes that will be
18
+ # @param max_data [Integer] The maximum amount of bytes that will be
19
19
  # scraped from the web (default is 1GB). Note that this value is used to
20
20
  # determine when to stop crawling; it's not a guarantee of the max data
21
21
  # that will be obtained.
22
- def self.index_the_web(max_sites_to_crawl = -1, max_data_size = 1_048_576_000)
22
+ def self.index_www(max_sites: -1, max_data: 1_048_576_000)
23
23
  db = Wgit::Database.new
24
24
  indexer = Wgit::Indexer.new(db)
25
- indexer.index_the_web(max_sites_to_crawl, max_data_size)
25
+ indexer.index_www(max_sites: max_sites, max_data: max_data)
26
26
  end
27
27
 
28
28
  # Convenience method to index a single website using
29
- # Wgit::Indexer#index_this_site.
29
+ # Wgit::Indexer#index_site.
30
30
  #
31
31
  # Crawls a single website's pages and stores them into the database.
32
32
  # There is no max download limit so be careful which sites you index.
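
The hunk above renames Wgit.index_the_web to Wgit.index_www and switches its positional arguments to keywords. A minimal sketch of calling the renamed method, assuming the wgit gem is installed and Wgit::Database can reach a running MongoDB instance (the limits shown are illustrative):

    require 'wgit'

    # Crawl at most 10 whole sites before returning (previously
    # Wgit.index_the_web(10)).
    Wgit.index_www(max_sites: 10)

    # Or crawl indefinitely, stopping once roughly 500MB has been scraped.
    Wgit.index_www(max_sites: -1, max_data: 500_000_000)
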
@@ -34,18 +34,18 @@ module Wgit
34
34
  # @param url [Wgit::Url, String] The base Url of the website to crawl.
35
35
  # @param insert_externals [Boolean] Whether or not to insert the website's
36
36
  # external Url's into the database.
37
- # @yield [Wgit::Document] Given the Wgit::Document of each crawled webpage,
38
- # before it is inserted into the database allowing for prior manipulation.
37
+ # @yield [doc] Given the Wgit::Document of each crawled webpage, before it's
38
+ # inserted into the database allowing for prior manipulation.
39
39
  # @return [Integer] The total number of pages crawled within the website.
40
- def self.index_this_site(url, insert_externals = true, &block)
41
- url = Wgit::Url.new url
40
+ def self.index_site(url, insert_externals: true, &block)
41
+ url = Wgit::Url.parse(url)
42
42
  db = Wgit::Database.new
43
43
  indexer = Wgit::Indexer.new(db)
44
- indexer.index_this_site(url, insert_externals, &block)
44
+ indexer.index_site(url, insert_externals: insert_externals, &block)
45
45
  end
46
46
 
47
47
  # Convenience method to index a single webpage using
48
- # Wgit::Indexer#index_this_page.
48
+ # Wgit::Indexer#index_page.
49
49
  #
50
50
  # Crawls a single webpage and stores it into the database.
51
51
  # There is no max download limit so be careful of large pages.
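
Likewise, Wgit.index_this_site becomes Wgit.index_site with insert_externals as a keyword argument. A rough usage sketch (the URL is illustrative):

    require 'wgit'

    Wgit.index_site('http://example.com', insert_externals: false) do |doc|
      # Each crawled Wgit::Document is yielded here before being inserted,
      # allowing it to be inspected or manipulated first.
      puts doc.url
    end
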
@@ -53,36 +53,50 @@ module Wgit
53
53
  # @param url [Wgit::Url, String] The Url of the webpage to crawl.
54
54
  # @param insert_externals [Boolean] Whether or not to insert the website's
55
55
  # external Url's into the database.
56
- # @yield [Wgit::Document] Given the Wgit::Document of the crawled webpage,
57
- # before it is inserted into the database allowing for prior manipulation.
58
- def self.index_this_page(url, insert_externals = true, &block)
59
- url = Wgit::Url.new url
56
+ # @yield [doc] Given the Wgit::Document of the crawled webpage, before it's
57
+ # inserted into the database allowing for prior manipulation.
58
+ def self.index_page(url, insert_externals: true, &block)
59
+ url = Wgit::Url.parse(url)
60
60
  db = Wgit::Database.new
61
61
  indexer = Wgit::Indexer.new(db)
62
- indexer.index_this_page(url, insert_externals, &block)
62
+ indexer.index_page(url, insert_externals: insert_externals, &block)
63
63
  end
64
64
 
65
65
  # Performs a search of the database's indexed documents and pretty prints
66
- # the results. See Wgit::Database#search for details of the search.
66
+ # the results. See Wgit::Database#search and Wgit::Document#search for
67
+ # details of how the search works.
67
68
  #
68
69
  # @param query [String] The text query to search with.
70
+ # @param case_sensitive [Boolean] Whether character case must match.
69
71
  # @param whole_sentence [Boolean] Whether multiple words should be searched
70
72
  # for separately.
71
- # @param limit [Integer] The max number of results to return.
73
+ # @param limit [Integer] The max number of results to print.
72
74
  # @param skip [Integer] The number of DB records to skip.
73
- # @param sentence_length [Integer] The max length of each result's text
75
+ # @param sentence_limit [Integer] The max length of each result's text
74
76
  # snippet.
75
- # @yield [Wgit::Document] Given each search result (Wgit::Document).
76
- def self.indexed_search(query, whole_sentence = false, limit = 10,
77
- skip = 0, sentence_length = 80, &block)
78
- db = Wgit::Database.new
79
- results = db.search(query, whole_sentence, limit, skip, &block)
80
- Wgit::Utils.printf_search_results(results, query, false, sentence_length)
77
+ # @yield [doc] Given each search result (Wgit::Document) returned from the
78
+ # database.
79
+ def self.indexed_search(query, case_sensitive: false, whole_sentence: false,
80
+ limit: 10, skip: 0, sentence_limit: 80, &block)
81
+ results = Wgit::Database.new.search(
82
+ query, case_sensitive: case_sensitive, whole_sentence: whole_sentence,
83
+ limit: limit, skip: skip, &block
84
+ )
85
+
86
+ results.each do |doc|
87
+ doc.search!(
88
+ query,
89
+ case_sensitive: case_sensitive,
90
+ whole_sentence: whole_sentence,
91
+ sentence_limit: sentence_limit)
92
+ end
93
+
94
+ Wgit::Utils.printf_search_results(results)
81
95
  end
82
96
 
83
97
  # Class which sets up a crawler and saves the indexed docs to a database.
84
98
  class Indexer
85
- # The crawler used to scrape the WWW.
99
+ # The crawler used to index the WWW.
86
100
  attr_reader :crawler
87
101
 
88
102
  # The database instance used to store Urls and Documents in.
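
The rewritten Wgit.indexed_search also moves to keyword arguments and now runs Wgit::Document#search! over each result before printing. A hedged example, assuming some documents have already been indexed (the query is illustrative):

    require 'wgit'

    # Print up to 5 matching documents, trimming each text snippet to 60 chars.
    Wgit.indexed_search(
      'ruby web scraping',
      whole_sentence: true,
      limit: 5,
      sentence_limit: 60
    )
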
@@ -91,74 +105,73 @@ module Wgit
91
105
  # Initialize the Indexer.
92
106
  #
93
107
  # @param database [Wgit::Database] The database instance (already
94
- # initialized with the correct connection details etc).
108
+ # initialized with the correct connection string etc).
95
109
  def initialize(database)
96
110
  @crawler = Wgit::Crawler.new
97
- @db = database
111
+ @db = database
98
112
  end
99
113
 
100
114
  # Retrieves uncrawled url's from the database and recursively crawls each
101
115
  # site storing their internal pages into the database and adding their
102
- # external url's to be crawled later on. Logs info on the crawl
103
- # using Wgit.logger as it goes along.
116
+ # external url's to be crawled later on. Logs info on the crawl using
117
+ # Wgit.logger as it goes along.
104
118
  #
105
- # @param max_sites_to_crawl [Integer] The number of separate and whole
119
+ # @param max_sites [Integer] The number of separate and whole
106
120
  # websites to be crawled before the method exits. Defaults to -1 which
107
121
  # means the crawl will occur until manually stopped (Ctrl+C etc).
108
- # @param max_data_size [Integer] The maximum amount of bytes that will be
122
+ # @param max_data [Integer] The maximum amount of bytes that will be
109
123
  # scraped from the web (default is 1GB). Note that this value is used to
110
124
  # determine when to stop crawling; it's not a guarantee of the max data
111
125
  # that will be obtained.
112
- def index_the_web(max_sites_to_crawl = -1, max_data_size = 1_048_576_000)
113
- if max_sites_to_crawl < 0
114
- Wgit.logger.info("Indexing until the database has been filled or it runs out of \
115
- urls to crawl (which might be never).")
126
+ def index_www(max_sites: -1, max_data: 1_048_576_000)
127
+ if max_sites.negative?
128
+ Wgit.logger.info("Indexing until the database has been filled or it \
129
+ runs out of urls to crawl (which might be never).")
116
130
  end
117
131
  site_count = 0
118
132
 
119
- while keep_crawling?(site_count, max_sites_to_crawl, max_data_size)
133
+ while keep_crawling?(site_count, max_sites, max_data)
120
134
  Wgit.logger.info("Current database size: #{@db.size}")
121
- @crawler.urls = @db.uncrawled_urls
122
135
 
123
- if @crawler.urls.empty?
136
+ uncrawled_urls = @db.uncrawled_urls(limit: 100)
137
+
138
+ if uncrawled_urls.empty?
124
139
  Wgit.logger.info('No urls to crawl, exiting.')
125
140
  return
126
141
  end
127
- Wgit.logger.info("Starting crawl loop for: #{@crawler.urls}")
142
+ Wgit.logger.info("Starting crawl loop for: #{uncrawled_urls}")
128
143
 
129
144
  docs_count = 0
130
145
  urls_count = 0
131
146
 
132
- @crawler.urls.each do |url|
133
- unless keep_crawling?(site_count, max_sites_to_crawl, max_data_size)
134
- Wgit.logger.info("Reached max number of sites to crawl or database \
135
- capacity, exiting.")
147
+ uncrawled_urls.each do |url|
148
+ unless keep_crawling?(site_count, max_sites, max_data)
149
+ Wgit.logger.info("Reached max number of sites to crawl or \
150
+ database capacity, exiting.")
136
151
  return
137
152
  end
138
153
  site_count += 1
139
154
 
140
- url.crawled = true
141
- raise unless @db.update(url) == 1
142
-
143
155
  site_docs_count = 0
144
156
  ext_links = @crawler.crawl_site(url) do |doc|
145
- unless doc.empty?
146
- if write_doc_to_db(doc)
147
- docs_count += 1
148
- site_docs_count += 1
149
- end
157
+ if !doc.empty? && write_doc_to_db(doc)
158
+ docs_count += 1
159
+ site_docs_count += 1
150
160
  end
151
161
  end
152
162
 
163
+ raise 'Error updating url' unless @db.update(url) == 1
164
+
153
165
  urls_count += write_urls_to_db(ext_links)
166
+
154
167
  Wgit.logger.info("Crawled and saved #{site_docs_count} docs for the \
155
168
  site: #{url}")
156
169
  end
157
170
 
158
- Wgit.logger.info("Crawled and saved docs for #{docs_count} url(s) overall for \
159
- this iteration.")
160
- Wgit.logger.info("Found and saved #{urls_count} external url(s) for the next \
161
- iteration.")
171
+ Wgit.logger.info("Crawled and saved docs for #{docs_count} url(s) \
172
+ overall for this iteration.")
173
+ Wgit.logger.info("Found and saved #{urls_count} external url(s) for \
174
+ the next iteration.")
162
175
 
163
176
  nil
164
177
  end
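
The same keyword-argument change applies to Wgit::Indexer#index_www itself, for callers managing their own database instance. A minimal sketch, assuming Wgit::Database.new can connect to your MongoDB instance:

    require 'wgit'

    db      = Wgit::Database.new
    indexer = Wgit::Indexer.new(db)

    # Equivalent to the Wgit.index_www convenience method, but reusing your
    # own Database/Indexer instances.
    indexer.index_www(max_sites: 5)
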
@@ -171,30 +184,27 @@ iteration.")
171
184
  # @param url [Wgit::Url] The base Url of the website to crawl.
172
185
  # @param insert_externals [Boolean] Whether or not to insert the website's
173
186
  # external Url's into the database.
174
- # @yield [Wgit::Document] Given the Wgit::Document of each crawled web
175
- # page, before it is inserted into the database allowing for prior
176
- # manipulation. Return nil or false from the block to prevent the
177
- # document from being saved into the database.
187
+ # @yield [doc] Given the Wgit::Document of each crawled web page before
188
+ # it's inserted into the database allowing for prior manipulation. Return
189
+ # nil or false from the block to prevent the document from being saved
190
+ # into the database.
178
191
  # @return [Integer] The total number of webpages/documents indexed.
179
- def index_this_site(url, insert_externals = true)
192
+ def index_site(url, insert_externals: true)
180
193
  total_pages_indexed = 0
181
194
 
182
195
  ext_urls = @crawler.crawl_site(url) do |doc|
183
196
  result = true
184
197
  result = yield(doc) if block_given?
185
198
 
186
- if result
187
- if write_doc_to_db(doc)
188
- total_pages_indexed += 1
189
- Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
190
- end
199
+ if result && !doc.empty? && write_doc_to_db(doc)
200
+ total_pages_indexed += 1
201
+ Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
191
202
  end
192
203
  end
193
204
 
194
- url.crawled = true
195
205
  @db.url?(url) ? @db.update(url) : @db.insert(url)
196
206
 
197
- if insert_externals
207
+ if insert_externals && ext_urls
198
208
  write_urls_to_db(ext_urls)
199
209
  Wgit.logger.info("Found and saved #{ext_urls.length} external url(s)")
200
210
  end
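
As documented above, returning nil or false from the block passed to Wgit::Indexer#index_site prevents that document from being saved. A sketch of filtering a site crawl this way (the URL is illustrative and the predicate assumes the default Wgit::Document#text extension):

    require 'wgit'

    indexer = Wgit::Indexer.new(Wgit::Database.new)

    pages = indexer.index_site(Wgit::Url.parse('http://example.com')) do |doc|
      # Only save documents that contain some extracted text.
      !doc.text.empty?
    end

    puts "Indexed #{pages} page(s)"
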
@@ -212,27 +222,24 @@ site: #{url}")
212
222
  # @param url [Wgit::Url] The webpage Url to crawl.
213
223
  # @param insert_externals [Boolean] Whether or not to insert the webpage's
214
224
  # external Url's into the database.
215
- # @yield [Wgit::Document] Given the Wgit::Document of the crawled webpage,
216
- # before it is inserted into the database allowing for prior
225
+ # @yield [doc] Given the Wgit::Document of the crawled webpage,
226
+ # before it's inserted into the database allowing for prior
217
227
  # manipulation. Return nil or false from the block to prevent the
218
228
  # document from being saved into the database.
219
- def index_this_page(url, insert_externals = true)
220
- document = @crawler.crawl_page(url) do |doc|
229
+ def index_page(url, insert_externals: true)
230
+ document = @crawler.crawl_url(url) do |doc|
221
231
  result = true
222
232
  result = yield(doc) if block_given?
223
233
 
224
- if result
225
- if write_doc_to_db(doc)
226
- Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
227
- end
234
+ if result && !doc.empty? && write_doc_to_db(doc)
235
+ Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
228
236
  end
229
237
  end
230
238
 
231
- url.crawled = true
232
239
  @db.url?(url) ? @db.update(url) : @db.insert(url)
233
240
 
234
- if insert_externals
235
- ext_urls = document.external_links
241
+ ext_urls = document&.external_links
242
+ if insert_externals && ext_urls
236
243
  write_urls_to_db(ext_urls)
237
244
  Wgit.logger.info("Found and saved #{ext_urls.length} external url(s)")
238
245
  end
@@ -246,20 +253,16 @@ site: #{url}")
246
253
  # loop iteration.
247
254
  #
248
255
  # @param site_count [Integer] The current number of crawled sites.
249
- # @param max_sites_to_crawl [Integer] The maximum number of sites to crawl
250
- # before stopping.
251
- # @param max_data_size [Integer] The maximum amount of data to crawl before
256
+ # @param max_sites [Integer] The maximum number of sites to crawl
257
+ # before stopping. Use -1 for an infinite number of sites.
258
+ # @param max_data [Integer] The maximum amount of data to crawl before
252
259
  # stopping.
253
260
  # @return [Boolean] True if the crawl should continue, false otherwise.
254
- def keep_crawling?(site_count, max_sites_to_crawl, max_data_size)
255
- return false if @db.size >= max_data_size
256
-
257
- # If max_sites_to_crawl is -1 for example then crawl away.
258
- if max_sites_to_crawl < 0
259
- true
260
- else
261
- site_count < max_sites_to_crawl
262
- end
261
+ def keep_crawling?(site_count, max_sites, max_data)
262
+ return false if @db.size >= max_data
263
+ return true if max_sites.negative?
264
+
265
+ site_count < max_sites
263
266
  end
264
267
 
265
268
  # Write the doc to the DB. Note that the unique url index on the documents
@@ -270,9 +273,11 @@ site: #{url}")
270
273
  def write_doc_to_db(doc)
271
274
  @db.insert(doc)
272
275
  Wgit.logger.info("Saved document for url: #{doc.url}")
276
+
273
277
  true
274
278
  rescue Mongo::Error::OperationFailure
275
279
  Wgit.logger.info("Document already exists: #{doc.url}")
280
+
276
281
  false
277
282
  end
278
283
 
@@ -283,6 +288,7 @@ site: #{url}")
283
288
  # @return [Boolean] True if the write was successful, false otherwise.
284
289
  def write_urls_to_db(urls)
285
290
  count = 0
291
+
286
292
  if urls.respond_to?(:each)
287
293
  urls.each do |url|
288
294
  @db.insert(url)
@@ -292,6 +298,7 @@ site: #{url}")
292
298
  Wgit.logger.info("Url already exists: #{url}")
293
299
  end
294
300
  end
301
+
295
302
  count
296
303
  end
297
304
  end
@@ -6,16 +6,18 @@ require 'logger'
6
6
 
7
7
  module Wgit
8
8
  # The Logger instance used by Wgit. Set your own custom logger after
9
- # requiring this file if needed.
9
+ # requiring this file as needed.
10
10
  @logger = nil
11
11
 
12
12
  # Returns the current Logger instance.
13
+ #
13
14
  # @return [Logger] The current Logger instance.
14
15
  def self.logger
15
16
  @logger
16
17
  end
17
18
 
18
19
  # Sets the current Logger instance.
20
+ #
19
21
  # @param logger [Logger] The Logger instance to use.
20
22
  # @return [Logger] The current Logger instance having been set.
21
23
  def self.logger=(logger)
@@ -23,6 +25,7 @@ module Wgit
23
25
  end
24
26
 
25
27
  # Returns the default Logger instance.
28
+ #
26
29
  # @return [Logger] The default Logger instance.
27
30
  def self.default_logger
28
31
  logger = Logger.new(STDOUT, progname: 'wgit', level: :info)
@@ -33,6 +36,7 @@ module Wgit
33
36
  end
34
37
 
35
38
  # Sets the default Logger instance to be used by Wgit.
39
+ #
36
40
  # @return [Logger] The default Logger instance.
37
41
  def self.use_default_logger
38
42
  @logger = default_logger
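
Because the logger is exposed via Wgit.logger and Wgit.logger=, a custom Logger can be swapped in after requiring the gem, for example to log to a file (the filename and levels are illustrative):

    require 'logger'
    require 'wgit'

    Wgit.logger = Logger.new('wgit.log', progname: 'wgit', level: :debug)

    # Or simply adjust the default logger that Wgit sets up.
    Wgit.logger.level = Logger::WARN
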
@@ -6,10 +6,11 @@ require 'uri'
6
6
  require 'addressable/uri'
7
7
 
8
8
  module Wgit
9
- # Class modeling a web based URL.
9
+ # Class modeling a web based HTTP URL.
10
+ #
10
11
  # Can be an internal/relative link e.g. "about.html" or a full URL
11
- # e.g. "http://www.google.co.uk". Is a subclass of String and uses
12
- # 'uri' and 'addressable/uri' internally.
12
+ # e.g. "http://www.google.co.uk". Is a subclass of String and uses 'uri' and
13
+ # 'addressable/uri' internally.
13
14
  class Url < String
14
15
  include Assertable
15
16
 
@@ -17,104 +18,73 @@ module Wgit
17
18
  # is also provided by this class.
18
19
  attr_reader :crawled
19
20
 
20
- # The date which the Url was crawled.
21
+ # The Time at which the Url was crawled.
21
22
  attr_accessor :date_crawled
22
23
 
23
24
  # Initializes a new instance of Wgit::Url which represents a web based
24
25
  # HTTP URL.
25
26
  #
26
- # @param url_or_obj [String, Object#fetch#[]] Is either a String based
27
- # URL or an object representing a Database record e.g. a MongoDB
28
- # document/object.
29
- # @param crawled [Boolean] Whether or not the HTML of the URL's web
30
- # page has been scraped or not.
31
- # @param date_crawled [Time] Should only be provided if crawled is
32
- # true. A suitable object can be returned from
33
- # Wgit::Utils.time_stamp.
34
- # @raise [RuntimeError] If url_or_obj is an Object with missing methods.
35
- def initialize(url_or_obj, crawled = false, date_crawled = nil)
27
+ # @param url_or_obj [String, Wgit::Url, Object#fetch#[]] Is either a String
28
+ # based URL or an object representing a Database record e.g. a MongoDB
29
+ # document/object.
30
+ # @param crawled [Boolean] Whether or not the HTML of the URL's web page
31
+ # has been crawled. Only used if url_or_obj is a String.
32
+ # @param date_crawled [Time] Should only be provided if crawled is true. A
33
+ # suitable object can be returned from Wgit::Utils.time_stamp. Only used
34
+ # if url_or_obj is a String.
35
+ # @raise [StandardError] If url_or_obj is an Object with missing methods.
36
+ def initialize(url_or_obj, crawled: false, date_crawled: nil)
36
37
  # Init from a URL String.
37
38
  if url_or_obj.is_a?(String)
38
39
  url = url_or_obj.to_s
39
- # Else init from a database object/document.
40
+ # Else init from a Hash like object e.g. database object.
40
41
  else
41
42
  obj = url_or_obj
42
- assert_respond_to(obj, [:fetch, :[]])
43
+ assert_respond_to(obj, :fetch)
43
44
 
44
- url = obj.fetch('url') # Should always be present.
45
- crawled = obj.fetch('crawled', false)
46
- date_crawled = obj['date_crawled']
45
+ url = obj.fetch('url') # Should always be present.
46
+ crawled = obj.fetch('crawled', false)
47
+ date_crawled = obj.fetch('date_crawled', nil)
47
48
  end
48
49
 
49
- @uri = Addressable::URI.parse(url)
50
- @crawled = crawled
50
+ @uri = Addressable::URI.parse(url)
51
+ @crawled = crawled
51
52
  @date_crawled = date_crawled
52
53
 
53
54
  super(url)
54
55
  end
55
56
 
56
- # A class alias for Url.new.
57
+ # Initialises a new Wgit::Url instance from a String or subclass of String
58
+ # e.g. Wgit::Url. Any other obj type will raise an error.
57
59
  #
58
- # @param str [String] The URL string to parse.
59
- # @return [Wgit::Url] The parsed Url object.
60
- def self.parse(str)
61
- new(str)
62
- end
63
-
64
- # Raises an exception if url is not a valid HTTP URL.
60
+ # If obj is already a Wgit::Url then it will be returned as is to maintain
61
+ # its state. Otherwise, a new Wgit::Url is instantiated and returned. This
62
+ # differs from Wgit::Url.new which always instantiates a new Wgit::Url.
65
63
  #
66
- # @param url [Wgit::Url, String] The Url to validate.
67
- # @raise [RuntimeError] If url is invalid.
68
- def self.validate(url)
69
- url = Wgit::Url.new(url)
70
- raise "Invalid url (or a relative link): #{url}" if url.relative_link?
71
- unless url.start_with?('http://') || url.start_with?('https://')
72
- raise "Invalid url (missing protocol prefix): #{url}"
73
- end
74
- if URI::DEFAULT_PARSER.make_regexp.match(url.normalise).nil?
75
- raise "Invalid url: #{url}"
76
- end
77
- end
78
-
79
- # Determines if the Url is valid or not.
64
+ # Note: Only use this method if you are allowing obj to be either a String
65
+ # or a Wgit::Url whose state you want to preserve e.g. when passing a URL
66
+ # to a crawl method which might redirect (calling Wgit::Url#replace). If
67
+ # you're sure of the type or don't care about preserving the state of the
68
+ # Wgit::Url, use Wgit::Url.new instead.
80
69
  #
81
- # @param url [Wgit::Url, String] The Url to validate.
82
- # @return [Boolean] True if valid, otherwise false.
83
- def self.valid?(url)
84
- Wgit::Url.validate(url)
85
- true
86
- rescue StandardError
87
- false
88
- end
70
+ # @param obj [Object] The object to parse, which #is_a?(String).
71
+ # @raise [StandardError] If obj.is_a?(String) is false.
72
+ # @return [Wgit::Url] A Wgit::Url instance.
73
+ def self.parse(obj)
74
+ raise 'Can only parse if obj#is_a?(String)' unless obj.is_a?(String)
89
75
 
90
- # Modifies the receiver url by prefixing it with a protocol.
91
- # Returns the url whether its been modified or not.
92
- # The default protocol prefix is http://.
93
- #
94
- # @param url [Wgit::Url, String] The url to be prefixed with a protocol.
95
- # @param https [Boolean] Whether the protocol prefix is https or http.
96
- # @return [Wgit::Url] The url with a protocol prefix.
97
- def self.prefix_protocol(url, https = false)
98
- unless url.start_with?('http://') || url.start_with?('https://')
99
- if https
100
- url.replace("https://#{url}")
101
- else
102
- url.replace("http://#{url}")
103
- end
104
- end
105
- url
76
+ # Return a Wgit::Url as is to avoid losing state e.g. date_crawled etc.
77
+ obj.is_a?(Wgit::Url) ? obj : new(obj)
106
78
  end
107
79
 
108
- # Concats the host and link Strings and returns the result.
80
+ # Sets the @crawled instance var, also setting @date_crawled to the
81
+ # current time or nil (depending on the bool value).
109
82
  #
110
- # @param host [Wgit::Url, String] The Url host.
111
- # @param link [Wgit::Url, String] The link to add to the host prefix.
112
- # @return [Wgit::Url] host + "/" + link
113
- def self.concat(host, link)
114
- host = Wgit::Url.new(host).without_trailing_slash
115
- link = Wgit::Url.new(link).without_leading_slash
116
- separator = (link.start_with?('#') || link.start_with?('?')) ? '' : '/'
117
- Wgit::Url.new(host + separator + link)
83
+ # @param bool [Boolean] True if self has been crawled, false otherwise.
84
+ # @return [Time, NilClass] Returns the date crawled, if set.
85
+ def crawled=(bool)
86
+ @crawled = bool
87
+ @date_crawled = bool ? Wgit::Utils.time_stamp : nil
118
88
  end
119
89
 
120
90
  # Overrides String#replace setting the new_url @uri and String value.
@@ -123,108 +93,138 @@ module Wgit
123
93
  # @return [String] The new URL value once set.
124
94
  def replace(new_url)
125
95
  @uri = Addressable::URI.parse(new_url)
96
+
126
97
  super(new_url)
127
98
  end
128
99
 
129
100
  # Returns true if self is a relative Url; false if absolute.
130
101
  #
131
102
  # All external links in a page are expected to have a protocol prefix e.g.
132
- # "http://", otherwise the link is treated as an internal link (regardless
103
+ # 'http://', otherwise the link is treated as an internal link (regardless
133
104
  # of whether it's valid or not). The only exception is if an opts arg is
134
- # provided and self is a page belonging to that arg type e.g. domain; then
105
+ # provided and self is a page belonging to that arg type e.g. host; then
135
106
  # the link is relative.
136
107
  #
137
- # @param opts [Hash] The options with which to check relativity.
108
+ # @param opts [Hash] The options with which to check relativity. Only one
109
+ # opts param should be provided. The provided opts param Url must be
110
+ # absolute and be prefixed with a protocol. Consider using the output of
111
+ # Wgit::Url#to_base which should work unless it's nil.
112
+ # @option opts [Wgit::Url, String] :base The Url base e.g.
113
+ # http://www.google.com/how which gives a base of
114
+ # 'http://www.google.com'.
138
115
  # @option opts [Wgit::Url, String] :host The Url host e.g.
139
116
  # http://www.google.com/how which gives a host of 'www.google.com'.
140
- # The host must be absolute and prefixed with a protocol.
141
117
  # @option opts [Wgit::Url, String] :domain The Url domain e.g.
142
- # http://www.google.com/how which gives a domain of 'google.com'. The
143
- # domain must be absolute and prefixed with a protocol.
118
+ # http://www.google.com/how which gives a domain of 'google.com'.
144
119
  # @option opts [Wgit::Url, String] :brand The Url brand e.g.
145
- # http://www.google.com/how which gives a domain of 'google'. The
146
- # brand must be absolute and prefixed with a protocol.
147
- # @raise [RuntimeError] If self is invalid e.g. empty.
120
+ # http://www.google.com/how which gives a domain of 'google'.
121
+ # @raise [StandardError] If self is invalid e.g. empty or an invalid opts
122
+ # param has been provided.
148
123
  # @return [Boolean] True if relative, false if absolute.
149
- def is_relative?(opts = {})
150
- opts = { host: nil, domain: nil, brand: nil }.merge(opts)
151
-
152
- raise "Invalid link: '#{self}'" if empty?
153
- if opts.values.count(nil) < (opts.length - 1)
154
- raise "Provide only one of: #{opts.keys}"
155
- end
124
+ def relative?(opts = {})
125
+ defaults = { base: nil, host: nil, domain: nil, brand: nil }
126
+ opts = defaults.merge(opts)
127
+ raise 'Url (self) cannot be empty' if empty?
156
128
 
157
- host = opts[:host]
158
- if host
159
- host = Wgit::Url.new(host)
160
- if host.to_base.nil?
161
- raise "Invalid host, must be absolute and contain protocol: #{host}"
162
- end
163
- end
129
+ return true if @uri.relative?
164
130
 
165
- domain = opts[:domain]
166
- if domain
167
- domain = Wgit::Url.new(domain)
168
- if domain.to_base.nil?
169
- raise "Invalid domain, must be absolute and contain protocol: #{domain}"
170
- end
171
- end
131
+ # Self is absolute but may be relative to the opts param e.g. host.
132
+ opts.select! { |_k, v| v }
133
+ raise "Provide only one of: #{defaults.keys}" if opts.length > 1
172
134
 
173
- brand = opts[:brand]
174
- if brand
175
- brand = Wgit::Url.new(brand)
176
- if brand.to_base.nil?
177
- raise "Invalid brand, must be absolute and contain protocol: #{brand}"
178
- end
179
- end
135
+ return false if opts.empty?
180
136
 
181
- if @uri.relative?
182
- true
137
+ type, url = opts.first
138
+ url = Wgit::Url.new(url)
139
+ raise "Invalid opts param value, Url must be absolute and contain \
140
+ protocol: #{url}" unless url.to_base
141
+
142
+ case type
143
+ when :base # http://www.google.com
144
+ to_base == url.to_base
145
+ when :host # www.google.com
146
+ to_host == url.to_host
147
+ when :domain # google.com
148
+ to_domain == url.to_domain
149
+ when :brand # google
150
+ to_brand == url.to_brand
183
151
  else
184
- return host ? to_host == host.to_host : false if host
185
- return domain ? to_domain == domain.to_domain : false if domain
186
- return brand ? to_brand == brand.to_brand : false if brand
187
-
188
- false
152
+ raise "Unknown opts param: :#{type}, use one of: #{defaults.keys}"
189
153
  end
190
154
  end
191
155
 
192
- # Determines if self is a valid Url or not.
156
+ # Returns true if self is an absolute Url; false if relative.
193
157
  #
194
- # @return [Boolean] True if valid, otherwise false.
195
- def valid?
196
- Wgit::Url.valid?(self)
158
+ # @return [Boolean] True if absolute, false if relative.
159
+ def absolute?
160
+ @uri.absolute?
197
161
  end
198
162
 
199
- # Concats self and the link.
163
+ # Returns if self is a valid and absolute HTTP Url or not.
200
164
  #
201
- # @param link [Wgit::Url, String] The link to concat with self.
202
- # @return [Wgit::Url] self + "/" + link
203
- def concat(link)
204
- Wgit::Url.concat(self, link)
165
+ # @return [Boolean] True if valid and absolute, otherwise false.
166
+ def valid?
167
+ return false if relative?
168
+ return false unless start_with?('http://') || start_with?('https://')
169
+ return false if URI::DEFAULT_PARSER.make_regexp.match(normalize).nil?
170
+
171
+ true
205
172
  end
206
173
 
207
- # Sets the @crawled instance var, also setting @date_crawled to the
208
- # current time or nil (depending on the bool value).
174
+ # Concats self and path together before returning a new Url. Self is not
175
+ # modified.
209
176
  #
210
- # @param bool [Boolean] True if self has been crawled, false otherwise.
211
- def crawled=(bool)
212
- @crawled = bool
213
- @date_crawled = bool ? Wgit::Utils.time_stamp : nil
177
+ # @param path [Wgit::Url, String] The path to concat onto the end of self.
178
+ # @return [Wgit::Url] self + separator + path, separator depends on path.
179
+ def concat(path)
180
+ path = Wgit::Url.new(path)
181
+ raise 'path must be relative' unless path.is_relative?
182
+
183
+ path = path.without_leading_slash
184
+ separator = path.start_with?('#') || path.start_with?('?') ? '' : '/'
185
+
186
+ Wgit::Url.new(without_trailing_slash + separator + path)
214
187
  end
215
188
 
216
- # Normalises/escapes self and returns a new Wgit::Url.
189
+ # Normalises/escapes self and returns a new Wgit::Url. Self isn't modified.
217
190
  #
218
- # @return [Wgit::Url] An encoded version of self.
219
- def normalise
191
+ # @return [Wgit::Url] An escaped version of self.
192
+ def normalize
220
193
  Wgit::Url.new(@uri.normalize.to_s)
221
194
  end
222
195
 
196
+ # Modifies self by prefixing it with a protocol. Returns the url whether
197
+ # it's been modified or not. The default protocol prefix is http://.
198
+ #
199
+ # @param protocol [Symbol] Either :http or :https.
200
+ # @return [Wgit::Url] The url with protocol prefix (having been modified).
201
+ def prefix_protocol(protocol: :http)
202
+ unless %i[http https].include?(protocol)
203
+ raise 'protocol must be :http or :https'
204
+ end
205
+
206
+ unless start_with?('http://') || start_with?('https://')
207
+ protocol == :http ? replace("http://#{url}") : replace("https://#{url}")
208
+ end
209
+
210
+ self
211
+ end
212
+
213
+ # Returns a Hash containing this Url's instance vars excluding @uri.
214
+ # Used when storing the URL in a Database e.g. MongoDB etc.
215
+ #
216
+ # @return [Hash] self's instance vars as a Hash.
217
+ def to_h
218
+ ignore = ['@uri']
219
+ h = Wgit::Utils.to_h(self, ignore: ignore)
220
+ Hash[h.to_a.insert(0, ['url', self])] # Insert url at position 0.
221
+ end
222
+
223
223
  # Returns a normalised URI object for this URL.
224
224
  #
225
225
  # @return [URI::HTTP, URI::HTTPS] The URI object of self.
226
226
  def to_uri
227
- URI(normalise)
227
+ URI(normalize)
228
228
  end
229
229
 
230
230
  # Returns self.
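
The rewritten #relative? now takes exactly one of :base, :host, :domain or :brand (each an absolute URL with a protocol), and #concat only accepts a relative path. A hedged sketch of the new behaviour (values are illustrative):

    url = Wgit::Url.new('http://www.example.com/about')

    url.relative?                                  # => false (absolute URL)
    url.relative?(host: 'http://www.example.com')  # => true  (same host)
    url.relative?(base: 'https://www.example.com') # => false (different base/protocol)

    Wgit::Url.new('http://example.com').concat('contact') # => "http://example.com/contact"
    Wgit::Url.new('http://example.com').concat('#top')    # => "http://example.com#top"
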
@@ -311,7 +311,7 @@ module Wgit
311
311
  # e.g. Given http://google.com?q=ruby, '?q=ruby' is returned.
312
312
  #
313
313
  # @return [Wgit::Url, nil] Containing just the query string or nil.
314
- def to_query_string
314
+ def to_query
315
315
  query = @uri.query
316
316
  query ? Wgit::Url.new("?#{query}") : nil
317
317
  end
@@ -361,9 +361,8 @@ module Wgit
361
361
  #
362
362
  # @return [Wgit::Url] Self without leading or trailing slashes.
363
363
  def without_slashes
364
- self.
365
- without_leading_slash.
366
- without_trailing_slash
364
+ without_leading_slash
365
+ .without_trailing_slash
367
366
  end
368
367
 
369
368
  # Returns a new Wgit::Url with the base (proto and host) removed e.g. Given
@@ -388,8 +387,8 @@ module Wgit
388
387
  # URL.
389
388
  #
390
389
  # @return [Wgit::Url] Self with the query string portion removed.
391
- def without_query_string
392
- query = to_query_string
390
+ def without_query
391
+ query = to_query
393
392
  without_query_string = query ? gsub(query, '') : self
394
393
 
395
394
  Wgit::Url.new(without_query_string)
@@ -410,56 +409,43 @@ module Wgit
410
409
  Wgit::Url.new(without_anchor)
411
410
  end
412
411
 
413
- # Returns true if self is a URL query string e.g. ?q=hello etc.
412
+ # Returns true if self is a URL query string e.g. ?q=hello etc. Note this
413
+ # shouldn't be used to determine if self contains a query.
414
414
  #
415
415
  # @return [Boolean] True if self is a query string, false otherwise.
416
- def is_query_string?
416
+ def query?
417
417
  start_with?('?')
418
418
  end
419
419
 
420
- # Returns true if self is a URL anchor/fragment e.g. #top etc.
420
+ # Returns true if self is a URL anchor/fragment e.g. #top etc. Note this
421
+ # shouldn't be used to determine if self contains an anchor/fragment.
421
422
  #
422
423
  # @return [Boolean] True if self is a anchor/fragment, false otherwise.
423
- def is_anchor?
424
+ def anchor?
424
425
  start_with?('#')
425
426
  end
426
427
 
427
- # Returns a Hash containing this Url's instance vars excluding @uri.
428
- # Used when storing the URL in a Database e.g. MongoDB etc.
429
- #
430
- # @return [Hash] self's instance vars as a Hash.
431
- def to_h
432
- ignore = ['@uri']
433
- h = Wgit::Utils.to_h(self, ignore)
434
- Hash[h.to_a.insert(0, ['url', self])] # Insert url at position 0.
435
- end
436
-
437
- alias uri to_uri
438
- alias url to_url
439
- alias scheme to_scheme
440
- alias to_protocol to_scheme
441
- alias protocol to_scheme
442
- alias host to_host
443
- alias domain to_domain
444
- alias brand to_brand
445
- alias base to_base
446
- alias path to_path
447
- alias endpoint to_endpoint
448
- alias query_string to_query_string
449
- alias query to_query_string
450
- alias anchor to_anchor
451
- alias to_fragment to_anchor
452
- alias fragment to_anchor
453
- alias extension to_extension
454
- alias without_query without_query_string
428
+ alias crawled? crawled
429
+ alias is_relative? relative?
430
+ alias is_absolute? absolute?
431
+ alias is_valid? valid?
432
+ alias normalise normalize
433
+ alias uri to_uri
434
+ alias url to_url
435
+ alias scheme to_scheme
436
+ alias host to_host
437
+ alias domain to_domain
438
+ alias brand to_brand
439
+ alias base to_base
440
+ alias path to_path
441
+ alias endpoint to_endpoint
442
+ alias query to_query
443
+ alias anchor to_anchor
444
+ alias fragment to_anchor
445
+ alias extension to_extension
455
446
  alias without_fragment without_anchor
456
- alias is_query? is_query_string?
457
- alias is_fragment? is_anchor?
458
- alias relative_link? is_relative?
459
- alias internal_link? is_relative?
460
- alias is_internal? is_relative?
461
- alias relative? is_relative?
462
- alias crawled? crawled
463
- alias normalize normalise
447
+ alias is_query? query?
448
+ alias is_anchor? anchor?
449
+ alias fragment? anchor?
464
450
  end
465
451
  end
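
Finally, several query/anchor helpers are renamed: #to_query_string becomes #to_query, #without_query_string becomes #without_query, #is_query_string? becomes #query? and #is_anchor? becomes #anchor? (with is_query?/is_anchor? kept as aliases). A quick sketch using the new names (the URL is illustrative):

    url = Wgit::Url.new('http://example.com/search?q=ruby#results')

    url.to_query      # => "?q=ruby"
    url.without_query # => "http://example.com/search#results"
    url.query?        # => false (self is not itself a query string)
    url.anchor?       # => false

    Wgit::Url.new('?q=ruby').query?   # => true
    Wgit::Url.new('#results').anchor? # => true
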