wgit 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/lib/wgit.rb +1 -1
- data/lib/wgit/assertable.rb +72 -61
- data/lib/wgit/core_ext.rb +11 -5
- data/lib/wgit/crawler.rb +97 -57
- data/lib/wgit/database/database.rb +247 -170
- data/lib/wgit/database/model.rb +40 -24
- data/lib/wgit/database/mongo_connection_details.rb +44 -23
- data/lib/wgit/document.rb +534 -233
- data/lib/wgit/indexer.rb +235 -0
- data/lib/wgit/url.rb +199 -121
- data/lib/wgit/utils.rb +143 -96
- data/lib/wgit/version.rb +5 -1
- metadata +10 -9
- data/lib/wgit/web_crawler.rb +0 -134
data/lib/wgit/indexer.rb
ADDED
@@ -0,0 +1,235 @@
+require_relative 'crawler'
+require_relative 'database/database'
+
+module Wgit
+
+  # Convience method to index the World Wide Web using
+  # Wgit::Indexer#index_the_web.
+  #
+  # Retrieves uncrawled url's from the database and recursively crawls each
+  # site storing their internal pages into the database and adding their
+  # external url's to be crawled at a later date. Puts out info on the crawl
+  # to STDOUT as it goes along.
+  #
+  # @param max_sites_to_crawl [Integer] The number of separate and whole
+  #   websites to be crawled before the method exits. Defaults to -1 which
+  #   means the crawl will occur until manually stopped (Ctrl+C etc).
+  # @param max_data_size [Integer] The maximum amount of bytes that will be
+  #   scraped from the web (default is 1GB). Note, that this value is used to
+  #   determine when to stop crawling; it's not a guarantee of the max data
+  #   that will be obtained.
+  def self.index_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
+    db = Wgit::Database.new
+    indexer = Wgit::Indexer.new(db)
+    indexer.index_the_web(max_sites_to_crawl, max_data_size)
+  end
+
+  # Convience method to index a single website using
+  # Wgit::Indexer#index_this_site.
+  #
+  # Crawls a single website's pages and stores them into the database.
+  # There is no max download limit so be careful which sites you index.
+  #
+  # @param url [Wgit::Url, String] The base Url of the website to crawl.
+  # @param insert_externals [Boolean] Whether or not to insert the website's
+  #   external Url's into the database.
+  # @yield [doc] Given the Wgit::Document of each crawled web page, before it
+  #   is inserted into the database allowing for prior manipulation.
+  # @return [Integer] The total number of pages crawled within the website.
+  def self.index_this_site(url, insert_externals = true, &block)
+    url = Wgit::Url.new url
+    db = Wgit::Database.new
+    indexer = Wgit::Indexer.new(db)
+    indexer.index_this_site(url, insert_externals, &block)
+  end
+
+  # Performs a search of the database's indexed documents and pretty prints
+  # the results. See Wgit::Database#search for details of the search.
+  #
+  # @param query [String] The text query to search with.
+  # @param whole_sentence [Boolean] Whether multiple words should be searched
+  #   for separately.
+  # @param limit [Integer] The max number of results to return.
+  # @param skip [Integer] The number of DB records to skip.
+  # @param sentence_length [Integer] The max length of each result's text
+  #   snippet.
+  # @yield [doc] Given each search result (Wgit::Document).
+  def self.indexed_search(query, whole_sentence = false, limit = 10,
+                          skip = 0, sentence_length = 80, &block)
+    db = Wgit::Database.new
+    results = db.search(query, whole_sentence, limit, skip, &block)
+    Wgit::Utils.printf_search_results(results, query, false, sentence_length)
+  end
+
+  # Class which sets up a crawler and saves the indexed docs to a database.
+  class Indexer
+
+    # The crawler used to scrape the WWW.
+    attr_reader :crawler
+
+    # The database instance used to store Urls and Documents in.
+    attr_reader :db
+
+    # Initialize the Indexer.
+    #
+    # @param database [Wgit::Database] The database instance (already
+    #   initialized with the correct connection details etc).
+    def initialize(database)
+      @crawler = Wgit::Crawler.new
+      @db = database
+    end
+
+    # Retrieves uncrawled url's from the database and recursively crawls each
+    # site storing their internal pages into the database and adding their
+    # external url's to be crawled at a later date. Puts out info on the crawl
+    # to STDOUT as it goes along.
+    #
+    # @param max_sites_to_crawl [Integer] The number of separate and whole
+    #   websites to be crawled before the method exits. Defaults to -1 which
+    #   means the crawl will occur until manually stopped (Ctrl+C etc).
+    # @param max_data_size [Integer] The maximum amount of bytes that will be
+    #   scraped from the web (default is 1GB). Note, that this value is used to
+    #   determine when to stop crawling; it's not a guarantee of the max data
+    #   that will be obtained.
+    def index_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
+      if max_sites_to_crawl < 0
+        puts "Indexing until the database has been filled or it runs out of \
+urls to crawl (which might be never)."
+      end
+      site_count = 0
+
+      while keep_crawling?(site_count, max_sites_to_crawl, max_data_size) do
+        puts "Current database size: #{@db.size}"
+        @crawler.urls = @db.uncrawled_urls
+
+        if @crawler.urls.empty?
+          puts "No urls to crawl, exiting."
+          return
+        end
+        puts "Starting crawl loop for: #{@crawler.urls}"
+
+        docs_count = 0
+        urls_count = 0
+
+        @crawler.urls.each do |url|
+          unless keep_crawling?(site_count, max_sites_to_crawl, max_data_size)
+            puts "Reached max number of sites to crawl or database \
+capacity, exiting."
+            return
+          end
+          site_count += 1
+
+          url.crawled = true
+          raise unless @db.update(url) == 1
+
+          site_docs_count = 0
+          ext_links = @crawler.crawl_site(url) do |doc|
+            unless doc.empty?
+              if write_doc_to_db(doc)
+                docs_count += 1
+                site_docs_count += 1
+              end
+            end
+          end
+
+          urls_count += write_urls_to_db(ext_links)
+          puts "Crawled and saved #{site_docs_count} docs for the \
+site: #{url}"
+        end
+
+        puts "Crawled and saved docs for #{docs_count} url(s) overall for \
+this iteration."
+        puts "Found and saved #{urls_count} external url(s) for the next \
+iteration."
+      end
+    end
+
+    # Crawls a single website's pages and stores them into the database.
+    # There is no max download limit so be careful which sites you index.
+    # Puts out info on the crawl to STDOUT as it goes along.
+    #
+    # @param url [Wgit::Url] The base Url of the website to crawl.
+    # @param insert_externals [Boolean] Whether or not to insert the website's
+    #   external Url's into the database.
+    # @yield [doc] Given the Wgit::Document of each crawled web page, before it
+    #   is inserted into the database allowing for prior manipulation. Return
+    #   nil or false from the block to prevent the document from being saved
+    #   into the database.
+    # @return [Integer] The total number of webpages/documents indexed.
+    def index_this_site(url, insert_externals = true)
+      total_pages_indexed = 0
+
+      ext_urls = @crawler.crawl_site(url) do |doc|
+        result = true
+        if block_given?
+          result = yield(doc)
+        end
+
+        if result
+          if write_doc_to_db(doc)
+            total_pages_indexed += 1
+            puts "Crawled and saved internal page: #{doc.url}"
+          end
+        end
+      end
+
+      url.crawled = true
+      if !@db.url?(url)
+        @db.insert(url)
+      else
+        @db.update(url)
+      end
+
+      if insert_externals
+        write_urls_to_db(ext_urls)
+        puts "Found and saved #{ext_urls.length} external url(s)"
+      end
+
+      puts "Crawled and saved #{total_pages_indexed} docs for the \
+site: #{url}"
+
+      total_pages_indexed
+    end
+
+    private
+
+    # Keep crawling or not based on DB size and current loop iteration.
+    def keep_crawling?(site_count, max_sites_to_crawl, max_data_size)
+      return false if @db.size >= max_data_size
+      # If max_sites_to_crawl is -1 for example then crawl away.
+      if max_sites_to_crawl < 0
+        true
+      else
+        site_count < max_sites_to_crawl
+      end
+    end
+
+    # The unique url index on the documents collection prevents duplicate
+    # inserts.
+    def write_doc_to_db(doc)
+      @db.insert(doc)
+      puts "Saved document for url: #{doc.url}"
+      true
+    rescue Mongo::Error::OperationFailure
+      puts "Document already exists: #{doc.url}"
+      false
+    end
+
+    # The unique url index on the urls collection prevents duplicate inserts.
+    def write_urls_to_db(urls)
+      count = 0
+      if urls.respond_to?(:each)
+        urls.each do |url|
+          begin
+            @db.insert(url)
+            count += 1
+            puts "Inserted url: #{url}"
+          rescue Mongo::Error::OperationFailure
+            puts "Url already exists: #{url}"
+          end
+        end
+      end
+      count
+    end
+  end
+end
data/lib/wgit/url.rb
CHANGED
@@ -1,140 +1,218 @@
 require_relative 'utils'
+require_relative 'assertable'
 require 'uri'

 module Wgit

-  # @author Michael Telford
   # Class modeling a web based URL.
-  # Can be an internal link e.g. "about.html"
-  #
+  # Can be an internal/relative link e.g. "about.html" or a full URL
+  # e.g. "http://www.google.co.uk". Is a subclass of String and uses 'uri'
+  # internally.
   class Url < String
-
+    include Assertable

- [removed lines 13-42 not shown in this diff view]
-    rescue
-      false
-    end
-
-    # Modifies the receiver url by prefixing it with a protocol.
-    # Returns the url whether its been modified or not.
-    def self.prefix_protocol(url, https = false)
-      unless url.start_with?("http://") or url.start_with?("https://")
-        if https
-          url.replace("https://#{url}")
-        else
-          url.replace("http://#{url}")
-        end
-      end
-      url
-    end
-
-    # URI.split("http://www.google.co.uk/about.html") returns the following:
-    # array[2]: "www.google.co.uk", array[5]: "/about.html".
-    # This means that all external links in a page are expected to have a
-    # protocol prefix e.g. "http://", otherwise the link is treated as an
-    # internal link (regardless of whether it is valid or not).
-    def self.relative_link?(link)
-      link_segs = URI.split(link)
-      if not link_segs[2].nil? and not link_segs[2].empty?
-        false
-      elsif not link_segs[5].nil? and not link_segs[5].empty?
-        true
-      else
-        raise "Invalid link: #{link}"
-      end
-    end
-
-    def self.concat(host, link)
-      url = host
-      url.chop! if url.end_with?("/")
-      link = link[1..-1] if link.start_with?("/")
-      Wgit::Url.new(url + "/" + link)
-    end
-
-    def relative_link?
-      Wgit::Url.relative_link?(self)
-    end
-
-    def valid?
-      Wgit::Url.valid?(self)
+    # Whether or not the Url has been crawled or not.
+    attr_accessor :crawled
+
+    # The date which the Url was crawled.
+    attr_accessor :date_crawled
+
+    # Initializes a new instance of Wgit::Url which represents a web based
+    # HTTP URL.
+    #
+    # @param url_or_obj [String, Object#fetch#[]] Is either a String based
+    #   URL or an object representing a Database record e.g. a MongoDB
+    #   document/object.
+    # @param crawled [Boolean] Whether or not the HTML of the URL's web
+    #   page has been scraped or not.
+    # @param date_crawled [Time] Should only be provided if crawled is
+    #   true. A suitable object can be returned from
+    #   Wgit::Utils.time_stamp.
+    # @raise [RuntimeError] If url_or_obj is an Object with missing methods.
+    def initialize(url_or_obj, crawled = false, date_crawled = nil)
+      # Init from a URL String.
+      if url_or_obj.is_a?(String)
+        url = url_or_obj.to_s
+      # Else init from a database object/document.
+      else
+        obj = url_or_obj
+        assert_respond_to(obj, [:fetch, :[]])
+
+        url = obj.fetch("url") # Should always be present.
+        crawled = obj.fetch("crawled", false)
+        date_crawled = obj["date_crawled"]
       end
- [removed lines 90-92 not shown in this diff view]
+
+      @uri = URI(url)
+      @crawled = crawled
+      @date_crawled = date_crawled
+
+      super(url)
+    end
+
+    # Raises an exception if url is not a valid HTTP URL.
+    #
+    # @param url [Wgit::Url, String] The Url to validate.
+    # @raise [RuntimeError] If url is invalid.
+    def self.validate(url)
+      if Wgit::Url.relative_link?(url)
+        raise "Invalid url (or a relative link): #{url}"
       end
- [removed lines 94-95 not shown in this diff view]
-      @crawled = bool
-      @date_crawled = bool ? Wgit::Utils.time_stamp : nil
+      unless url.start_with?("http://") or url.start_with?("https://")
+        raise "Invalid url (missing protocol prefix): #{url}"
       end
- [removed lines 99-100 not shown in this diff view]
-      @uri
+      if URI.regexp.match(url).nil?
+        raise "Invalid url: #{url}"
       end
- [removed lines 103-105 not shown in this diff view]
+    end
+
+    # Determines if the Url is valid or not.
+    #
+    # @param url [Wgit::Url, String] The Url to validate.
+    # @return [Boolean] True if valid, otherwise false.
+    def self.valid?(url)
+      Wgit::Url.validate(url)
+      true
+    rescue
+      false
+    end
+
+    # Modifies the receiver url by prefixing it with a protocol.
+    # Returns the url whether its been modified or not.
+    # The default protocol prefix is http://.
+    #
+    # @param url [Wgit::Url, String] The url to be prefixed with a protocol.
+    # @param https [Boolean] Whether the protocol prefix is https or http.
+    # @return [Wgit::Url] The url with a protocol prefix.
+    def self.prefix_protocol(url, https = false)
+      unless url.start_with?("http://") or url.start_with?("https://")
+        if https
+          url.replace("https://#{url}")
+        else
+          url.replace("http://#{url}")
+        end
       end
- [removed lines 107-110 not shown in this diff view]
+      url
+    end
+
+    # Returns if link is a relative or absolute Url. How it works:
+    # URI.split("http://www.google.co.uk/about.html") returns the following:
+    # array[2]: "www.google.co.uk", array[5]: "/about.html".
+    # This means that all external links in a page are expected to have a
+    # protocol prefix e.g. "http://", otherwise the link is treated as an
+    # internal link (regardless of whether it is valid or not).
+    #
+    # @param link [Wgit::Url, String] The url to test if relative or not.
+    # @return [Boolean] True if relative, false if absolute.
+    # @raise [RuntimeError] If the link is invalid.
+    def self.relative_link?(link)
+      link_segs = URI.split(link)
+      if not link_segs[2].nil? and not link_segs[2].empty?
+        false
+      elsif not link_segs[5].nil? and not link_segs[5].empty?
+        true
+      else
+        raise "Invalid link: #{link}"
       end
+    end
+
+    # Concats the host and link Strings and returns the result.
+    #
+    # @param host [Wgit::Url, String] The Url host.
+    # @param link [Wgit::Url, String] The link to add to the host prefix.
+    # @return [Wgit::Url] host + "/" + link
+    def self.concat(host, link)
+      url = host
+      url.chop! if url.end_with?("/")
+      link = link[1..-1] if link.start_with?("/")
+      Wgit::Url.new(url + "/" + link)
+    end
+
+    # Returns if self is a relative or absolute Url.
+    # @return [Boolean] True if relative, false if absolute.
+    # @raise [RuntimeError] If the link is invalid.
+    def relative_link?
+      Wgit::Url.relative_link?(self)
+    end
+
+    # Determines if self is a valid Url or not.
+    #
+    # @return [Boolean] True if valid, otherwise false.
+    def valid?
+      Wgit::Url.valid?(self)
+    end
+
+    # Concats self (Url) and the link.
+    #
+    # @param link [Wgit::Url, String] The link to concat with self.
+    # @return [Wgit::Url] self + "/" + link
+    def concat(link)
+      Wgit::Url.concat(self, link)
+    end
+
+    # Sets the @crawled instance var, also setting @date_crawled to the
+    # current time or nil (depending on the bool value).
+    #
+    # @param bool [Boolean] True if self has been crawled, false otherwise.
+    def crawled=(bool)
+      @crawled = bool
+      @date_crawled = bool ? Wgit::Utils.time_stamp : nil
+    end
+
+    # Returns the @uri instance var of this URL.
+    #
+    # @return [URI::HTTP, URI::HTTPS] The URI object of self.
+    def to_uri
+      @uri
+    end

- [removed lines 113-125 not shown in this diff view]
+    # Returns self.
+    #
+    # @return [Wgit::Url] This (self) Url.
+    def to_url
+      self
+    end
+
+    # Returns a new Wgit::Url containing just the host of this URL e.g.
+    # Given http://www.google.co.uk/about.html, www.google.co.uk is returned.
+    #
+    # @return [Wgit::Url] Containing just the host.
+    def to_host
+      Wgit::Url.new(@uri.host)
+    end
+
+    # Returns the base of this URL e.g. the protocol and host combined.
+    # How it works:
+    # URI.split("http://www.google.co.uk/about.html") returns the following:
+    # array[0]: "http://", array[2]: "www.google.co.uk", which we use.
+    #
+    # @return [Wgit::Url] Base of self (Url) e.g. http://www.google.co.uk.
+    def to_base
+      if Wgit::Url.relative_link?(self)
+        raise "A relative link doesn't have a base URL: #{self}"
       end
- [removed lines 127-129 not shown in this diff view]
-      h = Wgit::Utils.to_h(self, ignore)
-      Hash[h.to_a.insert(0, [:url, self])] # Insert url at position 0.
+      url_segs = URI.split(self)
+      if url_segs[0].nil? or url_segs[2].nil? or url_segs[2].empty?
+        raise "Both a protocol and host are needed: #{self}"
       end
- [removed lines 133-138 not shown in this diff view]
+      base = "#{url_segs[0]}://#{url_segs[2]}"
+      Wgit::Url.new(base)
+    end
+
+    # Returns a Hash containing this Url's instance vars excluding @uri.
+    # Used when storing the URL in a Database e.g. MongoDB etc.
+    #
+    # @return [Hash] self's instance vars as a Hash.
+    def to_h
+      ignore = ["@uri"]
+      h = Wgit::Utils.to_h(self, ignore)
+      Hash[h.to_a.insert(0, ["url", self])] # Insert url at position 0.
+    end
+
+    alias :to_hash :to_h
+    alias :host :to_host
+    alias :base :to_base
+    alias :internal_link? :relative_link?
+    alias :crawled? :crawled
   end
 end
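To illustrate the reworked Wgit::Url API added above, here is a small sketch that is not part of the diff; the URL is illustrative only and the inline results follow from the new method bodies shown in this hunk.

require 'wgit'

url = Wgit::Url.new("http://www.google.co.uk/about.html")

url.relative_link? # => false (the link has a host)
url.to_host        # => "www.google.co.uk"
url.to_base        # => "http://www.google.co.uk"

# Marking the Url as crawled also timestamps it via Wgit::Utils.time_stamp.
url.crawled = true
url.crawled?       # => true

# Hash form used when persisting the Url, with "url" as the first key.
url.to_h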