wgit 0.0.1 → 0.0.2
- checksums.yaml +5 -5
- data/lib/wgit.rb +1 -1
- data/lib/wgit/assertable.rb +72 -61
- data/lib/wgit/core_ext.rb +11 -5
- data/lib/wgit/crawler.rb +97 -57
- data/lib/wgit/database/database.rb +247 -170
- data/lib/wgit/database/model.rb +40 -24
- data/lib/wgit/database/mongo_connection_details.rb +44 -23
- data/lib/wgit/document.rb +534 -233
- data/lib/wgit/indexer.rb +235 -0
- data/lib/wgit/url.rb +199 -121
- data/lib/wgit/utils.rb +143 -96
- data/lib/wgit/version.rb +5 -1
- metadata +10 -9
- data/lib/wgit/web_crawler.rb +0 -134
data/lib/wgit/indexer.rb
ADDED
@@ -0,0 +1,235 @@
+require_relative 'crawler'
+require_relative 'database/database'
+
+module Wgit
+
+  # Convience method to index the World Wide Web using
+  # Wgit::Indexer#index_the_web.
+  #
+  # Retrieves uncrawled url's from the database and recursively crawls each
+  # site storing their internal pages into the database and adding their
+  # external url's to be crawled at a later date. Puts out info on the crawl
+  # to STDOUT as it goes along.
+  #
+  # @param max_sites_to_crawl [Integer] The number of separate and whole
+  #   websites to be crawled before the method exits. Defaults to -1 which
+  #   means the crawl will occur until manually stopped (Ctrl+C etc).
+  # @param max_data_size [Integer] The maximum amount of bytes that will be
+  #   scraped from the web (default is 1GB). Note, that this value is used to
+  #   determine when to stop crawling; it's not a guarantee of the max data
+  #   that will be obtained.
+  def self.index_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
+    db = Wgit::Database.new
+    indexer = Wgit::Indexer.new(db)
+    indexer.index_the_web(max_sites_to_crawl, max_data_size)
+  end
+
+  # Convience method to index a single website using
+  # Wgit::Indexer#index_this_site.
+  #
+  # Crawls a single website's pages and stores them into the database.
+  # There is no max download limit so be careful which sites you index.
+  #
+  # @param url [Wgit::Url, String] The base Url of the website to crawl.
+  # @param insert_externals [Boolean] Whether or not to insert the website's
+  #   external Url's into the database.
+  # @yield [doc] Given the Wgit::Document of each crawled web page, before it
+  #   is inserted into the database allowing for prior manipulation.
+  # @return [Integer] The total number of pages crawled within the website.
+  def self.index_this_site(url, insert_externals = true, &block)
+    url = Wgit::Url.new url
+    db = Wgit::Database.new
+    indexer = Wgit::Indexer.new(db)
+    indexer.index_this_site(url, insert_externals, &block)
+  end
+
+  # Performs a search of the database's indexed documents and pretty prints
+  # the results. See Wgit::Database#search for details of the search.
+  #
+  # @param query [String] The text query to search with.
+  # @param whole_sentence [Boolean] Whether multiple words should be searched
+  #   for separately.
+  # @param limit [Integer] The max number of results to return.
+  # @param skip [Integer] The number of DB records to skip.
+  # @param sentence_length [Integer] The max length of each result's text
+  #   snippet.
+  # @yield [doc] Given each search result (Wgit::Document).
+  def self.indexed_search(query, whole_sentence = false, limit = 10,
+                          skip = 0, sentence_length = 80, &block)
+    db = Wgit::Database.new
+    results = db.search(query, whole_sentence, limit, skip, &block)
+    Wgit::Utils.printf_search_results(results, query, false, sentence_length)
+  end
+
+  # Class which sets up a crawler and saves the indexed docs to a database.
+  class Indexer
+
+    # The crawler used to scrape the WWW.
+    attr_reader :crawler
+
+    # The database instance used to store Urls and Documents in.
+    attr_reader :db
+
+    # Initialize the Indexer.
+    #
+    # @param database [Wgit::Database] The database instance (already
+    #   initialized with the correct connection details etc).
+    def initialize(database)
+      @crawler = Wgit::Crawler.new
+      @db = database
+    end
+
+    # Retrieves uncrawled url's from the database and recursively crawls each
+    # site storing their internal pages into the database and adding their
+    # external url's to be crawled at a later date. Puts out info on the crawl
+    # to STDOUT as it goes along.
+    #
+    # @param max_sites_to_crawl [Integer] The number of separate and whole
+    #   websites to be crawled before the method exits. Defaults to -1 which
+    #   means the crawl will occur until manually stopped (Ctrl+C etc).
+    # @param max_data_size [Integer] The maximum amount of bytes that will be
+    #   scraped from the web (default is 1GB). Note, that this value is used to
+    #   determine when to stop crawling; it's not a guarantee of the max data
+    #   that will be obtained.
+    def index_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
+      if max_sites_to_crawl < 0
+        puts "Indexing until the database has been filled or it runs out of \
+urls to crawl (which might be never)."
+      end
+      site_count = 0
+
+      while keep_crawling?(site_count, max_sites_to_crawl, max_data_size) do
+        puts "Current database size: #{@db.size}"
+        @crawler.urls = @db.uncrawled_urls
+
+        if @crawler.urls.empty?
+          puts "No urls to crawl, exiting."
+          return
+        end
+        puts "Starting crawl loop for: #{@crawler.urls}"
+
+        docs_count = 0
+        urls_count = 0
+
+        @crawler.urls.each do |url|
+          unless keep_crawling?(site_count, max_sites_to_crawl, max_data_size)
+            puts "Reached max number of sites to crawl or database \
+capacity, exiting."
+            return
+          end
+          site_count += 1
+
+          url.crawled = true
+          raise unless @db.update(url) == 1
+
+          site_docs_count = 0
+          ext_links = @crawler.crawl_site(url) do |doc|
+            unless doc.empty?
+              if write_doc_to_db(doc)
+                docs_count += 1
+                site_docs_count += 1
+              end
+            end
+          end
+
+          urls_count += write_urls_to_db(ext_links)
+          puts "Crawled and saved #{site_docs_count} docs for the \
+site: #{url}"
+        end
+
+        puts "Crawled and saved docs for #{docs_count} url(s) overall for \
+this iteration."
+        puts "Found and saved #{urls_count} external url(s) for the next \
+iteration."
+      end
+    end
+
+    # Crawls a single website's pages and stores them into the database.
+    # There is no max download limit so be careful which sites you index.
+    # Puts out info on the crawl to STDOUT as it goes along.
+    #
+    # @param url [Wgit::Url] The base Url of the website to crawl.
+    # @param insert_externals [Boolean] Whether or not to insert the website's
+    #   external Url's into the database.
+    # @yield [doc] Given the Wgit::Document of each crawled web page, before it
+    #   is inserted into the database allowing for prior manipulation. Return
+    #   nil or false from the block to prevent the document from being saved
+    #   into the database.
+    # @return [Integer] The total number of webpages/documents indexed.
+    def index_this_site(url, insert_externals = true)
+      total_pages_indexed = 0
+
+      ext_urls = @crawler.crawl_site(url) do |doc|
+        result = true
+        if block_given?
+          result = yield(doc)
+        end
+
+        if result
+          if write_doc_to_db(doc)
+            total_pages_indexed += 1
+            puts "Crawled and saved internal page: #{doc.url}"
+          end
+        end
+      end
+
+      url.crawled = true
+      if !@db.url?(url)
+        @db.insert(url)
+      else
+        @db.update(url)
+      end
+
+      if insert_externals
+        write_urls_to_db(ext_urls)
+        puts "Found and saved #{ext_urls.length} external url(s)"
+      end
+
+      puts "Crawled and saved #{total_pages_indexed} docs for the \
+site: #{url}"
+
+      total_pages_indexed
+    end
+
+    private
+
+    # Keep crawling or not based on DB size and current loop iteration.
+    def keep_crawling?(site_count, max_sites_to_crawl, max_data_size)
+      return false if @db.size >= max_data_size
+      # If max_sites_to_crawl is -1 for example then crawl away.
+      if max_sites_to_crawl < 0
+        true
+      else
+        site_count < max_sites_to_crawl
+      end
+    end
+
+    # The unique url index on the documents collection prevents duplicate
+    # inserts.
+    def write_doc_to_db(doc)
+      @db.insert(doc)
+      puts "Saved document for url: #{doc.url}"
+      true
+    rescue Mongo::Error::OperationFailure
+      puts "Document already exists: #{doc.url}"
+      false
+    end
+
+    # The unique url index on the urls collection prevents duplicate inserts.
+    def write_urls_to_db(urls)
+      count = 0
+      if urls.respond_to?(:each)
+        urls.each do |url|
+          begin
+            @db.insert(url)
+            count += 1
+            puts "Inserted url: #{url}"
+          rescue Mongo::Error::OperationFailure
+            puts "Url already exists: #{url}"
+          end
+        end
+      end
+      count
+    end
+  end
+end
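To illustrate how the new convenience methods fit together, here is a minimal usage sketch (it is not part of the gem source). It assumes a reachable MongoDB instance whose connection details have already been configured as Wgit::Database expects (see data/lib/wgit/database/mongo_connection_details.rb), and the example.com URL and query string are purely illustrative.

require 'wgit'

# Index a single site; the block sees each Wgit::Document before it is saved.
pages_indexed = Wgit.index_this_site("http://www.example.com") do |doc|
  puts "About to store: #{doc.url}"
  true # Return nil/false instead to skip saving this particular page.
end
puts "Indexed #{pages_indexed} page(s)."

# Search the indexed documents and pretty print the top 5 results.
Wgit.indexed_search("example query", false, 5)

# Or crawl the wider web until 10 whole sites have been indexed.
Wgit.index_the_web(10)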
data/lib/wgit/url.rb
CHANGED
@@ -1,140 +1,218 @@
 require_relative 'utils'
+require_relative 'assertable'
 require 'uri'
 
 module Wgit
 
-  # @author Michael Telford
   # Class modeling a web based URL.
-  # Can be an internal link e.g. "about.html"
-  #
+  # Can be an internal/relative link e.g. "about.html" or a full URL
+  # e.g. "http://www.google.co.uk". Is a subclass of String and uses 'uri'
+  # internally.
   class Url < String
-
+    include Assertable
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    rescue
-      false
-    end
-
-    # Modifies the receiver url by prefixing it with a protocol.
-    # Returns the url whether its been modified or not.
-    def self.prefix_protocol(url, https = false)
-      unless url.start_with?("http://") or url.start_with?("https://")
-        if https
-          url.replace("https://#{url}")
-        else
-          url.replace("http://#{url}")
-        end
-      end
-      url
-    end
-
-    # URI.split("http://www.google.co.uk/about.html") returns the following:
-    # array[2]: "www.google.co.uk", array[5]: "/about.html".
-    # This means that all external links in a page are expected to have a
-    # protocol prefix e.g. "http://", otherwise the link is treated as an
-    # internal link (regardless of whether it is valid or not).
-    def self.relative_link?(link)
-      link_segs = URI.split(link)
-      if not link_segs[2].nil? and not link_segs[2].empty?
-        false
-      elsif not link_segs[5].nil? and not link_segs[5].empty?
-        true
-      else
-        raise "Invalid link: #{link}"
-      end
-    end
-
-    def self.concat(host, link)
-      url = host
-      url.chop! if url.end_with?("/")
-      link = link[1..-1] if link.start_with?("/")
-      Wgit::Url.new(url + "/" + link)
-    end
-
-    def relative_link?
-      Wgit::Url.relative_link?(self)
-    end
-
-    def valid?
-      Wgit::Url.valid?(self)
+    # Whether or not the Url has been crawled or not.
+    attr_accessor :crawled
+
+    # The date which the Url was crawled.
+    attr_accessor :date_crawled
+
+    # Initializes a new instance of Wgit::Url which represents a web based
+    # HTTP URL.
+    #
+    # @param url_or_obj [String, Object#fetch#[]] Is either a String based
+    #   URL or an object representing a Database record e.g. a MongoDB
+    #   document/object.
+    # @param crawled [Boolean] Whether or not the HTML of the URL's web
+    #   page has been scraped or not.
+    # @param date_crawled [Time] Should only be provided if crawled is
+    #   true. A suitable object can be returned from
+    #   Wgit::Utils.time_stamp.
+    # @raise [RuntimeError] If url_or_obj is an Object with missing methods.
+    def initialize(url_or_obj, crawled = false, date_crawled = nil)
+      # Init from a URL String.
+      if url_or_obj.is_a?(String)
+        url = url_or_obj.to_s
+      # Else init from a database object/document.
+      else
+        obj = url_or_obj
+        assert_respond_to(obj, [:fetch, :[]])
+
+        url = obj.fetch("url") # Should always be present.
+        crawled = obj.fetch("crawled", false)
+        date_crawled = obj["date_crawled"]
       end
-
-
-
+
+      @uri = URI(url)
+      @crawled = crawled
+      @date_crawled = date_crawled
+
+      super(url)
+    end
+
+    # Raises an exception if url is not a valid HTTP URL.
+    #
+    # @param url [Wgit::Url, String] The Url to validate.
+    # @raise [RuntimeError] If url is invalid.
+    def self.validate(url)
+      if Wgit::Url.relative_link?(url)
+        raise "Invalid url (or a relative link): #{url}"
       end
-
-
-      @crawled = bool
-      @date_crawled = bool ? Wgit::Utils.time_stamp : nil
+      unless url.start_with?("http://") or url.start_with?("https://")
+        raise "Invalid url (missing protocol prefix): #{url}"
       end
-
-
-      @uri
+      if URI.regexp.match(url).nil?
+        raise "Invalid url: #{url}"
       end
-
-
-
+    end
+
+    # Determines if the Url is valid or not.
+    #
+    # @param url [Wgit::Url, String] The Url to validate.
+    # @return [Boolean] True if valid, otherwise false.
+    def self.valid?(url)
+      Wgit::Url.validate(url)
+      true
+    rescue
+      false
+    end
+
+    # Modifies the receiver url by prefixing it with a protocol.
+    # Returns the url whether its been modified or not.
+    # The default protocol prefix is http://.
+    #
+    # @param url [Wgit::Url, String] The url to be prefixed with a protocol.
+    # @param https [Boolean] Whether the protocol prefix is https or http.
+    # @return [Wgit::Url] The url with a protocol prefix.
+    def self.prefix_protocol(url, https = false)
+      unless url.start_with?("http://") or url.start_with?("https://")
+        if https
+          url.replace("https://#{url}")
+        else
+          url.replace("http://#{url}")
+        end
       end
-
-
-
-
+      url
+    end
+
+    # Returns if link is a relative or absolute Url. How it works:
+    # URI.split("http://www.google.co.uk/about.html") returns the following:
+    # array[2]: "www.google.co.uk", array[5]: "/about.html".
+    # This means that all external links in a page are expected to have a
+    # protocol prefix e.g. "http://", otherwise the link is treated as an
+    # internal link (regardless of whether it is valid or not).
+    #
+    # @param link [Wgit::Url, String] The url to test if relative or not.
+    # @return [Boolean] True if relative, false if absolute.
+    # @raise [RuntimeError] If the link is invalid.
+    def self.relative_link?(link)
+      link_segs = URI.split(link)
+      if not link_segs[2].nil? and not link_segs[2].empty?
+        false
+      elsif not link_segs[5].nil? and not link_segs[5].empty?
+        true
+      else
+        raise "Invalid link: #{link}"
       end
+    end
+
+    # Concats the host and link Strings and returns the result.
+    #
+    # @param host [Wgit::Url, String] The Url host.
+    # @param link [Wgit::Url, String] The link to add to the host prefix.
+    # @return [Wgit::Url] host + "/" + link
+    def self.concat(host, link)
+      url = host
+      url.chop! if url.end_with?("/")
+      link = link[1..-1] if link.start_with?("/")
+      Wgit::Url.new(url + "/" + link)
+    end
+
+    # Returns if self is a relative or absolute Url.
+    # @return [Boolean] True if relative, false if absolute.
+    # @raise [RuntimeError] If the link is invalid.
+    def relative_link?
+      Wgit::Url.relative_link?(self)
+    end
+
+    # Determines if self is a valid Url or not.
+    #
+    # @return [Boolean] True if valid, otherwise false.
+    def valid?
+      Wgit::Url.valid?(self)
+    end
+
+    # Concats self (Url) and the link.
+    #
+    # @param link [Wgit::Url, String] The link to concat with self.
+    # @return [Wgit::Url] self + "/" + link
+    def concat(link)
+      Wgit::Url.concat(self, link)
+    end
+
+    # Sets the @crawled instance var, also setting @date_crawled to the
+    # current time or nil (depending on the bool value).
+    #
+    # @param bool [Boolean] True if self has been crawled, false otherwise.
+    def crawled=(bool)
+      @crawled = bool
+      @date_crawled = bool ? Wgit::Utils.time_stamp : nil
+    end
+
+    # Returns the @uri instance var of this URL.
+    #
+    # @return [URI::HTTP, URI::HTTPS] The URI object of self.
+    def to_uri
+      @uri
+    end
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Returns self.
+    #
+    # @return [Wgit::Url] This (self) Url.
+    def to_url
+      self
+    end
+
+    # Returns a new Wgit::Url containing just the host of this URL e.g.
+    # Given http://www.google.co.uk/about.html, www.google.co.uk is returned.
+    #
+    # @return [Wgit::Url] Containing just the host.
+    def to_host
+      Wgit::Url.new(@uri.host)
+    end
+
+    # Returns the base of this URL e.g. the protocol and host combined.
+    # How it works:
+    # URI.split("http://www.google.co.uk/about.html") returns the following:
+    # array[0]: "http://", array[2]: "www.google.co.uk", which we use.
+    #
+    # @return [Wgit::Url] Base of self (Url) e.g. http://www.google.co.uk.
+    def to_base
+      if Wgit::Url.relative_link?(self)
+        raise "A relative link doesn't have a base URL: #{self}"
       end
-
-
-
-      h = Wgit::Utils.to_h(self, ignore)
-      Hash[h.to_a.insert(0, [:url, self])] # Insert url at position 0.
+      url_segs = URI.split(self)
+      if url_segs[0].nil? or url_segs[2].nil? or url_segs[2].empty?
+        raise "Both a protocol and host are needed: #{self}"
       end
-
-
-
-
-
-
+      base = "#{url_segs[0]}://#{url_segs[2]}"
+      Wgit::Url.new(base)
+    end
+
+    # Returns a Hash containing this Url's instance vars excluding @uri.
+    # Used when storing the URL in a Database e.g. MongoDB etc.
+    #
+    # @return [Hash] self's instance vars as a Hash.
+    def to_h
+      ignore = ["@uri"]
+      h = Wgit::Utils.to_h(self, ignore)
+      Hash[h.to_a.insert(0, ["url", self])] # Insert url at position 0.
+    end
+
+    alias :to_hash :to_h
+    alias :host :to_host
+    alias :base :to_base
+    alias :internal_link? :relative_link?
+    alias :crawled? :crawled
   end
 end
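For reference, here is a short sketch (not from the gem itself) of the reworked Wgit::Url API shown above; the values in the comments follow the behaviour documented in the diff, and the example URLs and record are illustrative only.

require 'wgit'

url = Wgit::Url.new("http://www.google.co.uk/about.html")

url.relative_link?  # => false ("about.html" on its own would be true)
url.valid?          # => true
url.to_host         # => "www.google.co.uk" (also aliased as #host)
url.to_base         # => "http://www.google.co.uk" (also aliased as #base)
url.concat("jobs")  # => "http://www.google.co.uk/about.html/jobs"

# Crawl bookkeeping now lives on the Url itself.
url.crawled = true  # Also sets #date_crawled via Wgit::Utils.time_stamp.
url.to_h            # Hash of the instance vars (minus @uri), "url" first.

# A Url can also be rebuilt from a database record responding to #fetch/#[].
record = { "url" => "http://www.example.com", "crawled" => false }
Wgit::Url.new(record)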