wgit 0.0.18 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wgit.rb +0 -1
- data/lib/wgit/assertable.rb +20 -23
- data/lib/wgit/core_ext.rb +6 -14
- data/lib/wgit/crawler.rb +94 -183
- data/lib/wgit/database/database.rb +209 -185
- data/lib/wgit/database/model.rb +7 -7
- data/lib/wgit/document.rb +281 -241
- data/lib/wgit/indexer.rb +99 -92
- data/lib/wgit/logger.rb +5 -1
- data/lib/wgit/url.rb +171 -185
- data/lib/wgit/utils.rb +57 -68
- data/lib/wgit/version.rb +1 -1
- metadata +86 -60
- data/CHANGELOG.md +0 -61
- data/LICENSE.txt +0 -21
- data/README.md +0 -361
- data/TODO.txt +0 -34
- data/lib/wgit/database/connection_details.rb +0 -41
data/lib/wgit/indexer.rb
CHANGED
@@ -5,28 +5,28 @@ require_relative 'database/database'
 
 module Wgit
 # Convience method to index the World Wide Web using
-# Wgit::Indexer#
+# Wgit::Indexer#index_www.
 #
 # Retrieves uncrawled url's from the database and recursively crawls each
 # site storing their internal pages into the database and adding their
 # external url's to be crawled later on. Logs info on the crawl
 # using Wgit.logger as it goes along.
 #
-# @param
+# @param max_sites [Integer] The number of separate and whole
 # websites to be crawled before the method exits. Defaults to -1 which
 # means the crawl will occur until manually stopped (Ctrl+C etc).
-# @param
+# @param max_data [Integer] The maximum amount of bytes that will be
 # scraped from the web (default is 1GB). Note, that this value is used to
 # determine when to stop crawling; it's not a guarantee of the max data
 # that will be obtained.
-def self.
+def self.index_www(max_sites: -1, max_data: 1_048_576_000)
 db = Wgit::Database.new
 indexer = Wgit::Indexer.new(db)
-indexer.
+indexer.index_www(max_sites: max_sites, max_data: max_data)
 end
 
 # Convience method to index a single website using
-# Wgit::Indexer#
+# Wgit::Indexer#index_site.
 #
 # Crawls a single website's pages and stores them into the database.
 # There is no max download limit so be careful which sites you index.
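
The module-level convenience method now takes keyword arguments instead of positional ones. A minimal usage sketch based on the added lines above (it assumes a MongoDB instance that Wgit::Database.new can reach; the limits shown are illustrative):

  require 'wgit'

  # Index until 3 whole sites have been crawled, or until roughly 500MB of
  # data has been scraped, whichever comes first.
  Wgit.index_www(max_sites: 3, max_data: 524_288_000)
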
@@ -34,18 +34,18 @@ module Wgit
 # @param url [Wgit::Url, String] The base Url of the website to crawl.
 # @param insert_externals [Boolean] Whether or not to insert the website's
 # external Url's into the database.
-# @yield [
-#
+# @yield [doc] Given the Wgit::Document of each crawled webpage, before it's
+# inserted into the database allowing for prior manipulation.
 # @return [Integer] The total number of pages crawled within the website.
-def self.
-url = Wgit::Url.
+def self.index_site(url, insert_externals: true, &block)
+url = Wgit::Url.parse(url)
 db = Wgit::Database.new
 indexer = Wgit::Indexer.new(db)
-indexer.
+indexer.index_site(url, insert_externals: insert_externals, &block)
 end
 
 # Convience method to index a single webpage using
-# Wgit::Indexer#
+# Wgit::Indexer#index_page.
 #
 # Crawls a single webpage and stores it into the database.
 # There is no max download limit so be careful of large pages.
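
A hedged sketch of the new Wgit.index_site signature, which now yields each crawled Wgit::Document to an optional block before insertion (the site URL below is illustrative and the same database setup is assumed):

  require 'wgit'

  total = Wgit.index_site('http://example.com', insert_externals: false) do |doc|
    puts doc.title # Inspect/manipulate each page before it's saved.
  end
  puts "Crawled #{total} pages"
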
@@ -53,36 +53,50 @@ module Wgit
 # @param url [Wgit::Url, String] The Url of the webpage to crawl.
 # @param insert_externals [Boolean] Whether or not to insert the website's
 # external Url's into the database.
-# @yield [
-#
-def self.
-url = Wgit::Url.
+# @yield [doc] Given the Wgit::Document of the crawled webpage, before it's
+# inserted into the database allowing for prior manipulation.
+def self.index_page(url, insert_externals: true, &block)
+url = Wgit::Url.parse(url)
 db = Wgit::Database.new
 indexer = Wgit::Indexer.new(db)
-indexer.
+indexer.index_page(url, insert_externals: insert_externals, &block)
 end
 
 # Performs a search of the database's indexed documents and pretty prints
-# the results. See Wgit::Database#search
+# the results. See Wgit::Database#search and Wgit::Document#search for
+# details of how the search works.
 #
 # @param query [String] The text query to search with.
+# @param case_sensitive [Boolean] Whether character case must match.
 # @param whole_sentence [Boolean] Whether multiple words should be searched
 # for separately.
-# @param limit [Integer] The max number of results to
+# @param limit [Integer] The max number of results to print.
 # @param skip [Integer] The number of DB records to skip.
-# @param
+# @param sentence_limit [Integer] The max length of each result's text
 # snippet.
-# @yield [
-
-
-
-results =
-
+# @yield [doc] Given each search result (Wgit::Document) returned from the
+# database.
+def self.indexed_search(query, case_sensitive: false, whole_sentence: false,
+limit: 10, skip: 0, sentence_limit: 80, &block)
+results = Wgit::Database.new.search(
+query, case_sensitive: case_sensitive, whole_sentence: whole_sentence,
+limit: limit, skip: skip, &block
+)
+
+results.each do |doc|
+doc.search!(
+query,
+case_sensitive: case_sensitive,
+whole_sentence: whole_sentence,
+sentence_limit: sentence_limit)
+end
+
+Wgit::Utils.printf_search_results(results)
 end
 
 # Class which sets up a crawler and saves the indexed docs to a database.
 class Indexer
-# The crawler used to
+# The crawler used to index the WWW.
 attr_reader :crawler
 
 # The database instance used to store Urls and Documents in.
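
Based on the new indexed_search shown above, a sketch of searching previously indexed documents and pretty printing the results (the query text and limits are illustrative):

  require 'wgit'

  # Prints up to 5 matching documents, trimming each text snippet to 60 chars.
  Wgit.indexed_search('ruby web crawler', limit: 5, sentence_limit: 60)
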
@@ -91,74 +105,73 @@ module Wgit
 # Initialize the Indexer.
 #
 # @param database [Wgit::Database] The database instance (already
-# initialized with the correct connection
+# initialized with the correct connection string etc).
 def initialize(database)
 @crawler = Wgit::Crawler.new
-@db
+@db = database
 end
 
 # Retrieves uncrawled url's from the database and recursively crawls each
 # site storing their internal pages into the database and adding their
-# external url's to be crawled later on. Logs info on the crawl
-#
+# external url's to be crawled later on. Logs info on the crawl using
+# Wgit.logger as it goes along.
 #
-# @param
+# @param max_sites [Integer] The number of separate and whole
 # websites to be crawled before the method exits. Defaults to -1 which
 # means the crawl will occur until manually stopped (Ctrl+C etc).
-# @param
+# @param max_data [Integer] The maximum amount of bytes that will be
 # scraped from the web (default is 1GB). Note, that this value is used to
 # determine when to stop crawling; it's not a guarantee of the max data
 # that will be obtained.
-def
-if
-Wgit.logger.info("Indexing until the database has been filled or it
-urls to crawl (which might be never).")
+def index_www(max_sites: -1, max_data: 1_048_576_000)
+if max_sites.negative?
+Wgit.logger.info("Indexing until the database has been filled or it \
+runs out of urls to crawl (which might be never).")
 end
 site_count = 0
 
-while keep_crawling?(site_count,
+while keep_crawling?(site_count, max_sites, max_data)
 Wgit.logger.info("Current database size: #{@db.size}")
-@crawler.urls = @db.uncrawled_urls
 
-
+uncrawled_urls = @db.uncrawled_urls(limit: 100)
+
+if uncrawled_urls.empty?
 Wgit.logger.info('No urls to crawl, exiting.')
 return
 end
-Wgit.logger.info("Starting crawl loop for: #{
+Wgit.logger.info("Starting crawl loop for: #{uncrawled_urls}")
 
 docs_count = 0
 urls_count = 0
 
-
-unless keep_crawling?(site_count,
-Wgit.logger.info("Reached max number of sites to crawl or
-capacity, exiting.")
+uncrawled_urls.each do |url|
+unless keep_crawling?(site_count, max_sites, max_data)
+Wgit.logger.info("Reached max number of sites to crawl or \
+database capacity, exiting.")
 return
 end
 site_count += 1
 
-url.crawled = true
-raise unless @db.update(url) == 1
-
 site_docs_count = 0
 ext_links = @crawler.crawl_site(url) do |doc|
-
-
-
-site_docs_count += 1
-end
+if !doc.empty? && write_doc_to_db(doc)
+docs_count += 1
+site_docs_count += 1
 end
 end
 
+raise 'Error updating url' unless @db.update(url) == 1
+
 urls_count += write_urls_to_db(ext_links)
+
 Wgit.logger.info("Crawled and saved #{site_docs_count} docs for the \
 site: #{url}")
 end
 
-Wgit.logger.info("Crawled and saved docs for #{docs_count} url(s)
-this iteration.")
-Wgit.logger.info("Found and saved #{urls_count} external url(s) for
-iteration.")
+Wgit.logger.info("Crawled and saved docs for #{docs_count} url(s) \
+overall for this iteration.")
+Wgit.logger.info("Found and saved #{urls_count} external url(s) for \
+the next iteration.")
 
 nil
 end
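
The reworked Indexer#index_www now pulls uncrawled urls in batches of 100, only counts non-empty documents that insert successfully, and updates each url after its site has been crawled. A sketch of driving the class directly rather than via the module method (database connection details are assumed to be configured elsewhere):

  require 'wgit'

  db      = Wgit::Database.new
  indexer = Wgit::Indexer.new(db)

  # Equivalent to Wgit.index_www, but with an explicit Database instance.
  indexer.index_www(max_sites: 10)
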
@@ -171,30 +184,27 @@ iteration.")
 # @param url [Wgit::Url] The base Url of the website to crawl.
 # @param insert_externals [Boolean] Whether or not to insert the website's
 # external Url's into the database.
-# @yield [
-#
-#
-#
+# @yield [doc] Given the Wgit::Document of each crawled web page before
+# it's inserted into the database allowing for prior manipulation. Return
+# nil or false from the block to prevent the document from being saved
+# into the database.
 # @return [Integer] The total number of webpages/documents indexed.
-def
+def index_site(url, insert_externals: true)
 total_pages_indexed = 0
 
 ext_urls = @crawler.crawl_site(url) do |doc|
 result = true
 result = yield(doc) if block_given?
 
-if result
-
-
-Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
-end
+if result && !doc.empty? && write_doc_to_db(doc)
+total_pages_indexed += 1
+Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
 end
 end
 
-url.crawled = true
 @db.url?(url) ? @db.update(url) : @db.insert(url)
 
-if insert_externals
+if insert_externals && ext_urls
 write_urls_to_db(ext_urls)
 Wgit.logger.info("Found and saved #{ext_urls.length} external url(s)")
 end
@@ -212,27 +222,24 @@ site: #{url}")
 # @param url [Wgit::Url] The webpage Url to crawl.
 # @param insert_externals [Boolean] Whether or not to insert the webpage's
 # external Url's into the database.
-# @yield [
-# before it
+# @yield [doc] Given the Wgit::Document of the crawled webpage,
+# before it's inserted into the database allowing for prior
 # manipulation. Return nil or false from the block to prevent the
 # document from being saved into the database.
-def
-document = @crawler.
+def index_page(url, insert_externals: true)
+document = @crawler.crawl_url(url) do |doc|
 result = true
 result = yield(doc) if block_given?
 
-if result
-
-Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
-end
+if result && !doc.empty? && write_doc_to_db(doc)
+Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
 end
 end
 
-url.crawled = true
 @db.url?(url) ? @db.update(url) : @db.insert(url)
 
-
-
+ext_urls = document&.external_links
+if insert_externals && ext_urls
 write_urls_to_db(ext_urls)
 Wgit.logger.info("Found and saved #{ext_urls.length} external url(s)")
 end
@@ -246,20 +253,16 @@ site: #{url}")
 # loop iteration.
 #
 # @param site_count [Integer] The current number of crawled sites.
-# @param
-# before stopping.
-# @param
+# @param max_sites [Integer] The maximum number of sites to crawl
+# before stopping. Use -1 for an infinite number of sites.
+# @param max_data [Integer] The maximum amount of data to crawl before
 # stopping.
 # @return [Boolean] True if the crawl should continue, false otherwise.
-def keep_crawling?(site_count,
-return false if @db.size >=
-
-
-
-true
-else
-site_count < max_sites_to_crawl
-end
+def keep_crawling?(site_count, max_sites, max_data)
+return false if @db.size >= max_data
+return true if max_sites.negative?
+
+site_count < max_sites
 end
 
 # Write the doc to the DB. Note that the unique url index on the documents
@@ -270,9 +273,11 @@ site: #{url}")
 def write_doc_to_db(doc)
 @db.insert(doc)
 Wgit.logger.info("Saved document for url: #{doc.url}")
+
 true
 rescue Mongo::Error::OperationFailure
 Wgit.logger.info("Document already exists: #{doc.url}")
+
 false
 end
 
@@ -283,6 +288,7 @@ site: #{url}")
 # @return [Boolean] True if the write was successful, false otherwise.
 def write_urls_to_db(urls)
 count = 0
+
 if urls.respond_to?(:each)
 urls.each do |url|
 @db.insert(url)
@@ -292,6 +298,7 @@ site: #{url}")
 Wgit.logger.info("Url already exists: #{url}")
 end
 end
+
 count
 end
 end
data/lib/wgit/logger.rb
CHANGED
@@ -6,16 +6,18 @@ require 'logger'
 
 module Wgit
 # The Logger instance used by Wgit. Set your own custom logger after
-# requiring this file
+# requiring this file as needed.
 @logger = nil
 
 # Returns the current Logger instance.
+#
 # @return [Logger] The current Logger instance.
 def self.logger
 @logger
 end
 
 # Sets the current Logger instance.
+#
 # @param logger [Logger] The Logger instance to use.
 # @return [Logger] The current Logger instance having being set.
 def self.logger=(logger)
@@ -23,6 +25,7 @@ module Wgit
 end
 
 # Returns the default Logger instance.
+#
 # @return [Logger] The default Logger instance.
 def self.default_logger
 logger = Logger.new(STDOUT, progname: 'wgit', level: :info)
@@ -33,6 +36,7 @@ module Wgit
 end
 
 # Sets the default Logger instance to be used by Wgit.
+#
 # @return [Logger] The default Logger instance.
 def self.use_default_logger
 @logger = default_logger
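
The logger remains a module-level attribute; this diff only touches the doc comments. A small sketch of swapping in a custom logger via the existing setter (the log file name is illustrative):

  require 'logger'
  require 'wgit'

  # Send Wgit's crawl/indexing output to a file instead of STDOUT.
  Wgit.logger = Logger.new('wgit.log', progname: 'wgit', level: :debug)
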
data/lib/wgit/url.rb
CHANGED
@@ -6,10 +6,11 @@ require 'uri'
 require 'addressable/uri'
 
 module Wgit
-# Class modeling a web based URL.
+# Class modeling a web based HTTP URL.
+#
 # Can be an internal/relative link e.g. "about.html" or a full URL
-# e.g. "http://www.google.co.uk". Is a subclass of String and uses
-# '
+# e.g. "http://www.google.co.uk". Is a subclass of String and uses 'uri' and
+# 'addressable/uri' internally.
 class Url < String
 include Assertable
 
@@ -17,104 +18,73 @@ module Wgit
 # is also provided by this class.
 attr_reader :crawled
 
-# The
+# The Time which the Url was crawled.
 attr_accessor :date_crawled
 
 # Initializes a new instance of Wgit::Url which represents a web based
 # HTTP URL.
 #
-# @param url_or_obj [String, Object#fetch#[]] Is either a String
-#
-#
-# @param crawled [Boolean] Whether or not the HTML of the URL's web
-#
-# @param date_crawled [Time] Should only be provided if crawled is
-#
-#
-# @raise [
-def initialize(url_or_obj, crawled
+# @param url_or_obj [String, Wgit::Url, Object#fetch#[]] Is either a String
+# based URL or an object representing a Database record e.g. a MongoDB
+# document/object.
+# @param crawled [Boolean] Whether or not the HTML of the URL's web page
+# has been crawled or not. Only used if url_or_obj is a String.
+# @param date_crawled [Time] Should only be provided if crawled is true. A
+# suitable object can be returned from Wgit::Utils.time_stamp. Only used
+# if url_or_obj is a String.
+# @raise [StandardError] If url_or_obj is an Object with missing methods.
+def initialize(url_or_obj, crawled: false, date_crawled: nil)
 # Init from a URL String.
 if url_or_obj.is_a?(String)
 url = url_or_obj.to_s
-# Else init from a database object
+# Else init from a Hash like object e.g. database object.
 else
 obj = url_or_obj
-assert_respond_to(obj,
+assert_respond_to(obj, :fetch)
 
-url
-crawled
-date_crawled = obj
+url = obj.fetch('url') # Should always be present.
+crawled = obj.fetch('crawled', false)
+date_crawled = obj.fetch('date_crawled', nil)
 end
 
-@uri
-@crawled
+@uri = Addressable::URI.parse(url)
+@crawled = crawled
 @date_crawled = date_crawled
 
 super(url)
 end
 
-#
+# Initialises a new Wgit::Url instance from a String or subclass of String
+# e.g. Wgit::Url. Any other obj type will raise an error.
 #
-#
-#
-
-new(str)
-end
-
-# Raises an exception if url is not a valid HTTP URL.
+# If obj is already a Wgit::Url then it will be returned as is to maintain
+# it's state. Otherwise, a new Wgit::Url is instantiated and returned. This
+# differs from Wgit::Url.new which always instantiates a new Wgit::Url.
 #
-#
-#
-
-
-
-unless url.start_with?('http://') || url.start_with?('https://')
-raise "Invalid url (missing protocol prefix): #{url}"
-end
-if URI::DEFAULT_PARSER.make_regexp.match(url.normalise).nil?
-raise "Invalid url: #{url}"
-end
-end
-
-# Determines if the Url is valid or not.
+# Note: Only use this method if you are allowing obj to be either a String
+# or a Wgit::Url whose state you want to preserve e.g. when passing a URL
+# to a crawl method which might redirect (calling Wgit::Url#replace). If
+# you're sure of the type or don't care about preserving the state of the
+# Wgit::Url, use Wgit::Url.new instead.
 #
-# @param
-# @
-
-
-
-rescue StandardError
-false
-end
+# @param obj [Object] The object to parse, which #is_a?(String).
+# @raise [StandardError] If obj.is_a?(String) is false.
+# @return [Wgit::Url] A Wgit::Url instance.
+def self.parse(obj)
+raise 'Can only parse if obj#is_a?(String)' unless obj.is_a?(String)
 
-
-
-# The default protocol prefix is http://.
-#
-# @param url [Wgit::Url, String] The url to be prefixed with a protocol.
-# @param https [Boolean] Whether the protocol prefix is https or http.
-# @return [Wgit::Url] The url with a protocol prefix.
-def self.prefix_protocol(url, https = false)
-unless url.start_with?('http://') || url.start_with?('https://')
-if https
-url.replace("https://#{url}")
-else
-url.replace("http://#{url}")
-end
-end
-url
+# Return a Wgit::Url as is to avoid losing state e.g. date_crawled etc.
+obj.is_a?(Wgit::Url) ? obj : new(obj)
 end
 
-#
+# Sets the @crawled instance var, also setting @date_crawled to the
+# current time or nil (depending on the bool value).
 #
-# @param
-# @
-
-
-
-link = Wgit::Url.new(link).without_leading_slash
-separator = (link.start_with?('#') || link.start_with?('?')) ? '' : '/'
-Wgit::Url.new(host + separator + link)
+# @param bool [Boolean] True if self has been crawled, false otherwise.
+# @return [Time, NilClass] Returns the date crawled, if set.
+def crawled=(bool)
+@crawled = bool
+@date_crawled = bool ? Wgit::Utils.time_stamp : nil
 end
 
 # Overrides String#replace setting the new_url @uri and String value.
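
The new class method Wgit::Url.parse returns an existing Wgit::Url untouched (preserving its crawled state) and only instantiates when given a plain String, while the crawled= setter now stamps @date_crawled as well. A sketch of the difference, using an illustrative URL:

  require 'wgit'

  url = Wgit::Url.new('http://example.com')
  url.crawled = true                     # also sets @date_crawled via the new setter

  Wgit::Url.parse(url).equal?(url)       #=> true  (same object, state preserved)
  Wgit::Url.new(url).equal?(url)         #=> false (always a fresh instance)
  Wgit::Url.parse('http://example.com')  #=> a new Wgit::Url built from the String
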
@@ -123,108 +93,138 @@ module Wgit
 # @return [String] The new URL value once set.
 def replace(new_url)
 @uri = Addressable::URI.parse(new_url)
+
 super(new_url)
 end
 
 # Returns true if self is a relative Url; false if absolute.
 #
 # All external links in a page are expected to have a protocol prefix e.g.
-#
+# 'http://', otherwise the link is treated as an internal link (regardless
 # of whether it's valid or not). The only exception is if an opts arg is
-# provided and self is a page belonging to that arg type e.g.
+# provided and self is a page belonging to that arg type e.g. host; then
 # the link is relative.
 #
-# @param opts [Hash] The options with which to check relativity.
+# @param opts [Hash] The options with which to check relativity. Only one
+# opts param should be provided. The provided opts param Url must be
+# absolute and be prefixed with a protocol. Consider using the output of
+# Wgit::Url#to_base which should work unless it's nil.
+# @option opts [Wgit::Url, String] :base The Url base e.g.
+# http://www.google.com/how which gives a base of
+# 'http://www.google.com'.
 # @option opts [Wgit::Url, String] :host The Url host e.g.
 # http://www.google.com/how which gives a host of 'www.google.com'.
-# The host must be absolute and prefixed with a protocol.
 # @option opts [Wgit::Url, String] :domain The Url domain e.g.
-# http://www.google.com/how which gives a domain of 'google.com'.
-# domain must be absolute and prefixed with a protocol.
+# http://www.google.com/how which gives a domain of 'google.com'.
 # @option opts [Wgit::Url, String] :brand The Url brand e.g.
-# http://www.google.com/how which gives a domain of 'google'.
-#
-#
+# http://www.google.com/how which gives a domain of 'google'.
+# @raise [StandardError] If self is invalid e.g. empty or an invalid opts
+# param has been provided.
 # @return [Boolean] True if relative, false if absolute.
-def
-
-
-raise
-if opts.values.count(nil) < (opts.length - 1)
-raise "Provide only one of: #{opts.keys}"
-end
+def relative?(opts = {})
+defaults = { base: nil, host: nil, domain: nil, brand: nil }
+opts = defaults.merge(opts)
+raise 'Url (self) cannot be empty' if empty?
 
-
-if host
-host = Wgit::Url.new(host)
-if host.to_base.nil?
-raise "Invalid host, must be absolute and contain protocol: #{host}"
-end
-end
+return true if @uri.relative?
 
-
-
-
-if domain.to_base.nil?
-raise "Invalid domain, must be absolute and contain protocol: #{domain}"
-end
-end
+# Self is absolute but may be relative to the opts param e.g. host.
+opts.select! { |_k, v| v }
+raise "Provide only one of: #{defaults.keys}" if opts.length > 1
 
-
-if brand
-brand = Wgit::Url.new(brand)
-if brand.to_base.nil?
-raise "Invalid brand, must be absolute and contain protocol: #{brand}"
-end
-end
+return false if opts.empty?
 
-
-
+type, url = opts.first
+url = Wgit::Url.new(url)
+raise "Invalid opts param value, Url must be absolute and contain \
+protocol: #{url}" unless url.to_base
+
+case type
+when :base # http://www.google.com
+to_base == url.to_base
+when :host # www.google.com
+to_host == url.to_host
+when :domain # google.com
+to_domain == url.to_domain
+when :brand # google
+to_brand == url.to_brand
 else
-
-return domain ? to_domain == domain.to_domain : false if domain
-return brand ? to_brand == brand.to_brand : false if brand
-
-false
+raise "Unknown opts param: :#{type}, use one of: #{defaults.keys}"
 end
 end
 
-#
+# Returns true if self is an absolute Url; false if relative.
 #
-# @return [Boolean] True if
-def
-
+# @return [Boolean] True if absolute, false if relative.
+def absolute?
+@uri.absolute?
 end
 
-#
+# Returns if self is a valid and absolute HTTP Url or not.
 #
-# @
-
-
-
+# @return [Boolean] True if valid and absolute, otherwise false.
+def valid?
+return false if relative?
+return false unless start_with?('http://') || start_with?('https://')
+return false if URI::DEFAULT_PARSER.make_regexp.match(normalize).nil?
+
+true
 end
 
-#
-#
+# Concats self and path together before returning a new Url. Self is not
+# modified.
 #
-# @param
-
-
-
+# @param path [Wgit::Url, String] The path to concat onto the end of self.
+# @return [Wgit::Url] self + separator + path, separator depends on path.
+def concat(path)
+path = Wgit::Url.new(path)
+raise 'path must be relative' unless path.is_relative?
+
+path = path.without_leading_slash
+separator = path.start_with?('#') || path.start_with?('?') ? '' : '/'
+
+Wgit::Url.new(without_trailing_slash + separator + path)
 end
 
-# Normalises/escapes self and returns a new Wgit::Url.
+# Normalises/escapes self and returns a new Wgit::Url. Self isn't modified.
 #
-# @return [Wgit::Url] An
-def
+# @return [Wgit::Url] An escaped version of self.
+def normalize
 Wgit::Url.new(@uri.normalize.to_s)
 end
 
+# Modifies self by prefixing it with a protocol. Returns the url whether
+# its been modified or not. The default protocol prefix is http://.
+#
+# @param protocol [Symbol] Either :http or :https.
+# @return [Wgit::Url] The url with protocol prefix (having been modified).
+def prefix_protocol(protocol: :http)
+unless %i[http https].include?(protocol)
+raise 'protocol must be :http or :https'
+end
+
+unless start_with?('http://') || start_with?('https://')
+protocol == :http ? replace("http://#{url}") : replace("https://#{url}")
+end
+
+self
+end
+
+# Returns a Hash containing this Url's instance vars excluding @uri.
+# Used when storing the URL in a Database e.g. MongoDB etc.
+#
+# @return [Hash] self's instance vars as a Hash.
+def to_h
+ignore = ['@uri']
+h = Wgit::Utils.to_h(self, ignore: ignore)
+Hash[h.to_a.insert(0, ['url', self])] # Insert url at position 0.
+end
+
 # Returns a normalised URI object for this URL.
 #
 # @return [URI::HTTP, URI::HTTPS] The URI object of self.
 def to_uri
-URI(
+URI(normalize)
 end
 
 # Returns self.
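
A hedged sketch of the reworked helpers above: relative? now takes a single :base/:host/:domain/:brand opt, valid? is now an instance predicate, and prefix_protocol becomes an instance method (all URLs below are illustrative):

  require 'wgit'

  url = Wgit::Url.new('http://www.google.com/how')
  url.relative?                                 #=> false
  url.relative?(host: 'http://www.google.com')  #=> true, same host
  url.relative?(domain: 'http://google.com')    #=> true, same domain

  Wgit::Url.new('http://example.com/').concat('about?lang=en')
  #=> "http://example.com/about?lang=en"

  Wgit::Url.new('example.com').prefix_protocol(protocol: :https)
  #=> "https://example.com" (self is modified in place)
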
@@ -311,7 +311,7 @@ module Wgit
 # e.g. Given http://google.com?q=ruby, '?q=ruby' is returned.
 #
 # @return [Wgit::Url, nil] Containing just the query string or nil.
-def
+def to_query
 query = @uri.query
 query ? Wgit::Url.new("?#{query}") : nil
 end
@@ -361,9 +361,8 @@ module Wgit
 #
 # @return [Wgit::Url] Self without leading or trailing slashes.
 def without_slashes
-
-
-without_trailing_slash
+without_leading_slash
+.without_trailing_slash
 end
 
 # Returns a new Wgit::Url with the base (proto and host) removed e.g. Given
@@ -388,8 +387,8 @@ module Wgit
 # URL.
 #
 # @return [Wgit::Url] Self with the query string portion removed.
-def
-query =
+def without_query
+query = to_query
 without_query_string = query ? gsub(query, '') : self
 
 Wgit::Url.new(without_query_string)
@@ -410,56 +409,43 @@ module Wgit
 Wgit::Url.new(without_anchor)
 end
 
-# Returns true if self is a URL query string e.g. ?q=hello etc.
+# Returns true if self is a URL query string e.g. ?q=hello etc. Note this
+# shouldn't be used to determine if self contains a query.
 #
 # @return [Boolean] True if self is a query string, false otherwise.
-def
+def query?
 start_with?('?')
 end
 
-# Returns true if self is a URL anchor/fragment e.g. #top etc.
+# Returns true if self is a URL anchor/fragment e.g. #top etc. Note this
+# shouldn't be used to determine if self contains an anchor/fragment.
 #
 # @return [Boolean] True if self is a anchor/fragment, false otherwise.
-def
+def anchor?
 start_with?('#')
 end
 
-
-
-
-
-
-
-
-
-
-
-alias
-alias
-alias
-alias
-alias
-alias
-alias
-alias
-alias base to_base
-alias path to_path
-alias endpoint to_endpoint
-alias query_string to_query_string
-alias query to_query_string
-alias anchor to_anchor
-alias to_fragment to_anchor
-alias fragment to_anchor
-alias extension to_extension
-alias without_query without_query_string
+alias crawled? crawled
+alias is_relative? relative?
+alias is_absolute? absolute?
+alias is_valid? valid?
+alias normalise normalize
+alias uri to_uri
+alias url to_url
+alias scheme to_scheme
+alias host to_host
+alias domain to_domain
+alias brand to_brand
+alias base to_base
+alias path to_path
+alias endpoint to_endpoint
+alias query to_query
+alias anchor to_anchor
+alias fragment to_anchor
+alias extension to_extension
 alias without_fragment without_anchor
-alias is_query?
-alias
-alias
-alias internal_link? is_relative?
-alias is_internal? is_relative?
-alias relative? is_relative?
-alias crawled? crawled
-alias normalize normalise
+alias is_query? query?
+alias is_anchor? anchor?
+alias fragment? anchor?
 end
 end
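
In 0.2.0 the predicate methods drop their is_ prefixes (the old names survive as aliases) and normalize becomes the primary spelling. A quick sketch:

  require 'wgit'

  q = Wgit::Url.new('?q=hello')
  q.query?     #=> true (new name)
  q.is_query?  #=> true (kept as an alias)

  Wgit::Url.new('http://example.com/a b').normalize
  #=> "http://example.com/a%20b" (normalise is aliased to normalize)
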