wgit 0.0.10 → 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/wgit/url.rb CHANGED
@@ -1,278 +1,306 @@
1
- require_relative 'utils'
2
- require_relative 'assertable'
3
- require 'uri'
4
-
5
- module Wgit
6
-
7
- # Class modeling a web based URL.
8
- # Can be an internal/relative link e.g. "about.html" or a full URL
9
- # e.g. "http://www.google.co.uk". Is a subclass of String and uses 'uri'
10
- # internally.
11
- class Url < String
12
- include Assertable
13
-
14
- # Whether or not the Url has been crawled or not.
15
- attr_accessor :crawled
16
-
17
- # The date which the Url was crawled.
18
- attr_accessor :date_crawled
19
-
20
- # Initializes a new instance of Wgit::Url which represents a web based
21
- # HTTP URL.
22
- #
23
- # @param url_or_obj [String, Object#fetch#[]] Is either a String based
24
- # URL or an object representing a Database record e.g. a MongoDB
25
- # document/object.
26
- # @param crawled [Boolean] Whether or not the HTML of the URL's web
27
- # page has been scraped or not.
28
- # @param date_crawled [Time] Should only be provided if crawled is
29
- # true. A suitable object can be returned from
30
- # Wgit::Utils.time_stamp.
31
- # @raise [RuntimeError] If url_or_obj is an Object with missing methods.
32
- def initialize(url_or_obj, crawled = false, date_crawled = nil)
33
- # Init from a URL String.
34
- if url_or_obj.is_a?(String)
35
- url = url_or_obj.to_s
36
- # Else init from a database object/document.
37
- else
38
- obj = url_or_obj
39
- assert_respond_to(obj, [:fetch, :[]])
40
-
41
- url = obj.fetch("url") # Should always be present.
42
- crawled = obj.fetch("crawled", false)
43
- date_crawled = obj["date_crawled"]
44
- end
45
-
46
- @uri = URI(url)
47
- @crawled = crawled
48
- @date_crawled = date_crawled
49
-
50
- super(url)
51
- end
52
-
53
- # Raises an exception if url is not a valid HTTP URL.
54
- #
55
- # @param url [Wgit::Url, String] The Url to validate.
56
- # @raise [RuntimeError] If url is invalid.
57
- def self.validate(url)
58
- if Wgit::Url.relative_link?(url)
59
- raise "Invalid url (or a relative link): #{url}"
60
- end
61
- unless url.start_with?("http://") or url.start_with?("https://")
62
- raise "Invalid url (missing protocol prefix): #{url}"
63
- end
64
- if URI.regexp.match(url).nil?
65
- raise "Invalid url: #{url}"
66
- end
67
- end
68
-
69
- # Determines if the Url is valid or not.
70
- #
71
- # @param url [Wgit::Url, String] The Url to validate.
72
- # @return [Boolean] True if valid, otherwise false.
73
- def self.valid?(url)
74
- Wgit::Url.validate(url)
75
- true
76
- rescue
77
- false
78
- end
79
-
80
- # Modifies the receiver url by prefixing it with a protocol.
81
- # Returns the url whether its been modified or not.
82
- # The default protocol prefix is http://.
83
- #
84
- # @param url [Wgit::Url, String] The url to be prefixed with a protocol.
85
- # @param https [Boolean] Whether the protocol prefix is https or http.
86
- # @return [Wgit::Url] The url with a protocol prefix.
87
- def self.prefix_protocol(url, https = false)
88
- unless url.start_with?("http://") or url.start_with?("https://")
89
- if https
90
- url.replace("https://#{url}")
91
- else
92
- url.replace("http://#{url}")
93
- end
94
- end
95
- url
96
- end
97
-
98
- # Returns if link is a relative or absolute Url.
99
- # All external links in a page are expected to have a protocol prefix e.g.
100
- # "http://", otherwise the link is treated as an internal link (regardless
101
- # of whether it is valid or not). The only exception is if base is provided
102
- # and link is a page within that site; then the link is relative.
103
- #
104
- # @param link [Wgit::Url, String] The url to test if relative or not.
105
- # @param base [String] The Url base e.g. http://www.google.co.uk.
106
- # @return [Boolean] True if relative, false if absolute.
107
- # @raise [RuntimeError] If the link is invalid.
108
- def self.relative_link?(link, base: nil)
109
- if base and URI(base).host.nil?
110
- raise "Invalid base, must contain protocol prefix: #{base}"
111
- end
112
-
113
- uri = URI(link)
114
- if not uri.host.nil? and not uri.host.empty?
115
- if base
116
- uri.host == URI(base).host
117
- else
118
- false
119
- end
120
- elsif not uri.path.nil? and not uri.path.empty?
121
- true
122
- else
123
- raise "Invalid link: #{link}"
124
- end
125
- end
126
-
127
- # Concats the host and link Strings and returns the result.
128
- #
129
- # @param host [Wgit::Url, String] The Url host.
130
- # @param link [Wgit::Url, String] The link to add to the host prefix.
131
- # @return [Wgit::Url] host + "/" + link
132
- def self.concat(host, link)
133
- url = host
134
- url.chop! if url.end_with?("/")
135
- link = link[1..-1] if link.start_with?("/")
136
- Wgit::Url.new(url + "/" + link)
137
- end
138
-
139
- # Returns if self is a relative or absolute Url. If base is provided and
140
- # self is a page within that site then the link is relative.
141
- # See Wgit.relative_link? for more information.
142
- #
143
- # @return [Boolean] True if relative, false if absolute.
144
- # @raise [RuntimeError] If the link is invalid.
145
- def relative_link?(base: nil)
146
- Wgit::Url.relative_link?(self, base: base)
147
- end
148
-
149
- # Determines if self is a valid Url or not.
150
- #
151
- # @return [Boolean] True if valid, otherwise false.
152
- def valid?
153
- Wgit::Url.valid?(self)
154
- end
155
-
156
- # Concats self and the link.
157
- #
158
- # @param link [Wgit::Url, String] The link to concat with self.
159
- # @return [Wgit::Url] self + "/" + link
160
- def concat(link)
161
- Wgit::Url.concat(self, link)
162
- end
163
-
164
- # Sets the @crawled instance var, also setting @date_crawled to the
165
- # current time or nil (depending on the bool value).
166
- #
167
- # @param bool [Boolean] True if self has been crawled, false otherwise.
168
- def crawled=(bool)
169
- @crawled = bool
170
- @date_crawled = bool ? Wgit::Utils.time_stamp : nil
171
- end
172
-
173
- # Returns the @uri instance var of this URL.
174
- #
175
- # @return [URI::HTTP, URI::HTTPS] The URI object of self.
176
- def to_uri
177
- @uri
178
- end
179
-
180
- # Returns self.
181
- #
182
- # @return [Wgit::Url] This (self) Url.
183
- def to_url
184
- self
185
- end
186
-
187
- # Returns a new Wgit::Url containing just the scheme/protocol of this URL
188
- # e.g. Given http://www.google.co.uk, http is returned.
189
- #
190
- # @return [Wgit::Url] Containing just the scheme/protocol.
191
- def to_scheme
192
- Wgit::Url.new(@uri.scheme)
193
- end
194
-
195
- # Returns a new Wgit::Url containing just the host of this URL e.g.
196
- # Given http://www.google.co.uk/about.html, www.google.co.uk is returned.
197
- #
198
- # @return [Wgit::Url] Containing just the host.
199
- def to_host
200
- Wgit::Url.new(@uri.host)
201
- end
202
-
203
- # Returns only the base of this URL e.g. the protocol and host combined.
204
- #
205
- # @return [Wgit::Url] Base of self e.g. http://www.google.co.uk.
206
- def to_base
207
- if Wgit::Url.relative_link?(self)
208
- raise "A relative link doesn't have a base URL: #{self}"
209
- end
210
- if @uri.scheme.nil? or @uri.host.nil? or @uri.host.empty?
211
- raise "Both a protocol and host are needed: #{self}"
212
- end
213
- base = "#{@uri.scheme}://#{@uri.host}"
214
- Wgit::Url.new(base)
215
- end
216
-
217
- # Returns the path of this URL e.g. the bit after the host without slashes.
218
- # For example:
219
- # Wgit::Url.new("http://www.google.co.uk/about.html/").to_path returns
220
- # "about.html". See Wgit::Url#to_endpoint if you want the slashes.
221
- #
222
- # @return [Wgit::Url] Path of self e.g. about.html.
223
- def to_path
224
- path = @uri.path
225
- return Wgit::Url.new(path) if path == '/'
226
- path = path[1..-1] if path.start_with?('/')
227
- path.chop! if path.end_with?('/')
228
- Wgit::Url.new(path)
229
- end
230
-
231
- # Returns the endpoint of this URL e.g. the bit after the host with any
232
- # slashes included. For example:
233
- # Wgit::Url.new("http://www.google.co.uk/about.html/").to_endpoint returns
234
- # "/about.html/". See Wgit::Url#to_path if you don't want the slashes.
235
- #
236
- # @return [Wgit::Url] Endpoint of self e.g. /about.html/.
237
- def to_endpoint
238
- endpoint = @uri.path
239
- endpoint = '/' + endpoint unless endpoint.start_with?('/')
240
- Wgit::Url.new(endpoint)
241
- end
242
-
243
- # Returns a new Wgit::Url containing just the query string of this URL
244
- # e.g. Given http://google.com?q=ruby, ruby is returned.
245
- #
246
- # @return [Wgit::Url] Containing just the query string.
247
- def to_query_string
248
- Wgit::Url.new(@uri.query)
249
- end
250
-
251
- # Returns a Hash containing this Url's instance vars excluding @uri.
252
- # Used when storing the URL in a Database e.g. MongoDB etc.
253
- #
254
- # @return [Hash] self's instance vars as a Hash.
255
- def to_h
256
- ignore = ["@uri"]
257
- h = Wgit::Utils.to_h(self, ignore)
258
- Hash[h.to_a.insert(0, ["url", self])] # Insert url at position 0.
259
- end
260
-
261
- alias :to_hash :to_h
262
- alias :uri :to_uri
263
- alias :url :to_url
264
- alias :scheme :to_scheme
265
- alias :to_protocol :to_scheme
266
- alias :protocol :to_scheme
267
- alias :host :to_host
268
- alias :base :to_base
269
- alias :path :to_path
270
- alias :endpoint :to_endpoint
271
- alias :query_string :to_query_string
272
- alias :query :to_query_string
273
- alias :internal_link? :relative_link?
274
- alias :is_relative? :relative_link?
275
- alias :is_internal? :relative_link?
276
- alias :crawled? :crawled
277
- end
278
- end
1
+ require_relative 'utils'
2
+ require_relative 'assertable'
3
+ require 'uri'
4
+
5
+ module Wgit
6
+
7
+ # Class modeling a web based URL.
8
+ # Can be an internal/relative link e.g. "about.html" or a full URL
9
+ # e.g. "http://www.google.co.uk". Is a subclass of String and uses 'uri'
10
+ # internally.
11
+ class Url < String
12
+ include Assertable
13
+
14
+ # Whether or not the Url has been crawled or not.
15
+ attr_accessor :crawled
16
+
17
+ # The date which the Url was crawled.
18
+ attr_accessor :date_crawled
19
+
20
+ # Initializes a new instance of Wgit::Url which represents a web based
21
+ # HTTP URL.
22
+ #
23
+ # @param url_or_obj [String, Object#fetch#[]] Is either a String based
24
+ # URL or an object representing a Database record e.g. a MongoDB
25
+ # document/object.
26
+ # @param crawled [Boolean] Whether or not the HTML of the URL's web
27
+ # page has been scraped or not.
28
+ # @param date_crawled [Time] Should only be provided if crawled is
29
+ # true. A suitable object can be returned from
30
+ # Wgit::Utils.time_stamp.
31
+ # @raise [RuntimeError] If url_or_obj is an Object with missing methods.
32
+ def initialize(url_or_obj, crawled = false, date_crawled = nil)
33
+ # Init from a URL String.
34
+ if url_or_obj.is_a?(String)
35
+ url = url_or_obj.to_s
36
+ # Else init from a database object/document.
37
+ else
38
+ obj = url_or_obj
39
+ assert_respond_to(obj, [:fetch, :[]])
40
+
41
+ url = obj.fetch("url") # Should always be present.
42
+ crawled = obj.fetch("crawled", false)
43
+ date_crawled = obj["date_crawled"]
44
+ end
45
+
46
+ @uri = URI(url)
47
+ @crawled = crawled
48
+ @date_crawled = date_crawled
49
+
50
+ super(url)
51
+ end
52
+
53
+ # Raises an exception if url is not a valid HTTP URL.
54
+ #
55
+ # @param url [Wgit::Url, String] The Url to validate.
56
+ # @raise [RuntimeError] If url is invalid.
57
+ def self.validate(url)
58
+ if Wgit::Url.relative_link?(url)
59
+ raise "Invalid url (or a relative link): #{url}"
60
+ end
61
+ unless url.start_with?("http://") or url.start_with?("https://")
62
+ raise "Invalid url (missing protocol prefix): #{url}"
63
+ end
64
+ if URI.regexp.match(url).nil?
65
+ raise "Invalid url: #{url}"
66
+ end
67
+ end
68
+
69
+ # Determines if the Url is valid or not.
70
+ #
71
+ # @param url [Wgit::Url, String] The Url to validate.
72
+ # @return [Boolean] True if valid, otherwise false.
73
+ def self.valid?(url)
74
+ Wgit::Url.validate(url)
75
+ true
76
+ rescue
77
+ false
78
+ end
79
+
80
+ # Modifies the receiver url by prefixing it with a protocol.
81
+ # Returns the url whether its been modified or not.
82
+ # The default protocol prefix is http://.
83
+ #
84
+ # @param url [Wgit::Url, String] The url to be prefixed with a protocol.
85
+ # @param https [Boolean] Whether the protocol prefix is https or http.
86
+ # @return [Wgit::Url] The url with a protocol prefix.
87
+ def self.prefix_protocol(url, https = false)
88
+ unless url.start_with?("http://") or url.start_with?("https://")
89
+ if https
90
+ url.replace("https://#{url}")
91
+ else
92
+ url.replace("http://#{url}")
93
+ end
94
+ end
95
+ url
96
+ end
97
+
98
+ # Returns if link is a relative or absolute Url.
99
+ # All external links in a page are expected to have a protocol prefix e.g.
100
+ # "http://", otherwise the link is treated as an internal link (regardless
101
+ # of whether it is valid or not). The only exception is if base is provided
102
+ # and link is a page within that site; then the link is relative.
103
+ #
104
+ # @param link [Wgit::Url, String] The url to test if relative or not.
105
+ # @param base [String] The Url base e.g. http://www.google.co.uk.
106
+ # @return [Boolean] True if relative, false if absolute.
107
+ # @raise [RuntimeError] If the link is invalid.
108
+ def self.relative_link?(link, base: nil)
109
+ raise "Invalid link: #{link}" if link.nil? or link.empty?
110
+ if base and URI(base).host.nil?
111
+ raise "Invalid base, must contain protocol prefix: #{base}"
112
+ end
113
+
114
+ uri = URI(link)
115
+ if uri.relative?
116
+ true
117
+ else
118
+ base ? uri.host == URI(base).host : false
119
+ end
120
+ end
121
+
122
+ # Concats the host and link Strings and returns the result.
123
+ #
124
+ # @param host [Wgit::Url, String] The Url host.
125
+ # @param link [Wgit::Url, String] The link to add to the host prefix.
126
+ # @return [Wgit::Url] host + "/" + link
127
+ def self.concat(host, link)
128
+ url = host
129
+ url.chop! if url.end_with?('/')
130
+ link = link[1..-1] if link.start_with?('/')
131
+ separator = link.start_with?('#') ? '' : '/'
132
+ Wgit::Url.new(url + separator + link)
133
+ end
134
+
135
+ # Returns if self is a relative or absolute Url. If base is provided and
136
+ # self is a page within that site then the link is relative.
137
+ # See Wgit.relative_link? for more information.
138
+ #
139
+ # @return [Boolean] True if relative, false if absolute.
140
+ # @raise [RuntimeError] If the link is invalid.
141
+ def relative_link?(base: nil)
142
+ Wgit::Url.relative_link?(self, base: base)
143
+ end
144
+
145
+ # Determines if self is a valid Url or not.
146
+ #
147
+ # @return [Boolean] True if valid, otherwise false.
148
+ def valid?
149
+ Wgit::Url.valid?(self)
150
+ end
151
+
152
+ # Concats self and the link.
153
+ #
154
+ # @param link [Wgit::Url, String] The link to concat with self.
155
+ # @return [Wgit::Url] self + "/" + link
156
+ def concat(link)
157
+ Wgit::Url.concat(self, link)
158
+ end
159
+
160
+ # Sets the @crawled instance var, also setting @date_crawled to the
161
+ # current time or nil (depending on the bool value).
162
+ #
163
+ # @param bool [Boolean] True if self has been crawled, false otherwise.
164
+ def crawled=(bool)
165
+ @crawled = bool
166
+ @date_crawled = bool ? Wgit::Utils.time_stamp : nil
167
+ end
168
+
169
+ # Returns the @uri instance var of this URL.
170
+ #
171
+ # @return [URI::HTTP, URI::HTTPS] The URI object of self.
172
+ def to_uri
173
+ @uri
174
+ end
175
+
176
+ # Returns self.
177
+ #
178
+ # @return [Wgit::Url] This (self) Url.
179
+ def to_url
180
+ self
181
+ end
182
+
183
+ # Returns a new Wgit::Url containing just the scheme/protocol of this URL
184
+ # e.g. Given http://www.google.co.uk, http is returned.
185
+ #
186
+ # @return [Wgit::Url, nil] Containing just the scheme/protocol or nil.
187
+ def to_scheme
188
+ scheme = @uri.scheme
189
+ scheme ? Wgit::Url.new(scheme) : nil
190
+ end
191
+
192
+ # Returns a new Wgit::Url containing just the host of this URL e.g.
193
+ # Given http://www.google.co.uk/about.html, www.google.co.uk is returned.
194
+ #
195
+ # @return [Wgit::Url, nil] Containing just the host or nil.
196
+ def to_host
197
+ host = @uri.host
198
+ host ? Wgit::Url.new(host) : nil
199
+ end
200
+
201
+ # Returns only the base of this URL e.g. the protocol and host combined.
202
+ #
203
+ # @return [Wgit::Url, nil] Base of self e.g. http://www.google.co.uk or nil.
204
+ def to_base
205
+ return nil if @uri.scheme.nil? or @uri.host.nil?
206
+ base = "#{@uri.scheme}://#{@uri.host}"
207
+ Wgit::Url.new(base)
208
+ end
209
+
210
+ # Returns the path of this URL e.g. the bit after the host without slashes.
211
+ # For example:
212
+ # Wgit::Url.new("http://www.google.co.uk/about.html/").to_path returns
213
+ # "about.html". See Wgit::Url#to_endpoint if you want the slashes.
214
+ #
215
+ # @return [Wgit::Url, nil] Path of self e.g. about.html or nil.
216
+ def to_path
217
+ path = @uri.path
218
+ return nil if path.nil? or path.empty?
219
+ return Wgit::Url.new('/') if path == '/'
220
+ path = path[1..-1] if path.start_with?('/')
221
+ path.chop! if path.end_with?('/')
222
+ Wgit::Url.new(path)
223
+ end
224
+
225
+ # Returns the endpoint of this URL e.g. the bit after the host with any
226
+ # slashes included. For example:
227
+ # Wgit::Url.new("http://www.google.co.uk/about.html/").to_endpoint returns
228
+ # "/about.html/". See Wgit::Url#to_path if you don't want the slashes.
229
+ #
230
+ # @return [Wgit::Url] Endpoint of self e.g. /about.html/. For a URL without
231
+ # an endpoint, / is returned.
232
+ def to_endpoint
233
+ endpoint = @uri.path
234
+ endpoint = '/' + endpoint unless endpoint.start_with?('/')
235
+ Wgit::Url.new(endpoint)
236
+ end
237
+
238
+ # Returns a new Wgit::Url containing just the query string of this URL
239
+ # e.g. Given http://google.com?q=ruby, 'ruby' is returned.
240
+ #
241
+ # @return [Wgit::Url, nil] Containing just the query string or nil.
242
+ def to_query_string
243
+ query = @uri.query
244
+ query ? Wgit::Url.new(query) : nil
245
+ end
246
+
247
+ # Returns a new Wgit::Url containing just the anchor string of this URL
248
+ # e.g. Given http://google.com#about, #about is returned.
249
+ #
250
+ # @return [Wgit::Url, nil] Containing just the anchor string or nil.
251
+ def to_anchor
252
+ anchor = @uri.fragment
253
+ anchor ? Wgit::Url.new("##{anchor}") : nil
254
+ end
255
+
256
+ # Returns a new Wgit::Url containing just the path + anchor string of this
257
+ # URL e.g. Given http://google.com/us#about, us#about is returned.
258
+ #
259
+ # @return [Wgit::Url, nil] Containing just the path and anchor string or
260
+ # nil.
261
+ def to_path_and_anchor
262
+ path = to_path || ''
263
+ anchor = to_anchor || ''
264
+ both = path + anchor
265
+ both.empty? ? nil : Wgit::Url.new(both)
266
+ end
267
+
268
+ # Returns a new Wgit::Url containing self without a trailing slash. Is
269
+ # idempotent.
270
+ #
271
+ # @return [Wgit::Url] Without a trailing slash.
272
+ def without_trailing_slash
273
+ end_with?('/') ? Wgit::Url.new(chop) : self
274
+ end
275
+
276
+ # Returns a Hash containing this Url's instance vars excluding @uri.
277
+ # Used when storing the URL in a Database e.g. MongoDB etc.
278
+ #
279
+ # @return [Hash] self's instance vars as a Hash.
280
+ def to_h
281
+ ignore = ["@uri"]
282
+ h = Wgit::Utils.to_h(self, ignore)
283
+ Hash[h.to_a.insert(0, ["url", self])] # Insert url at position 0.
284
+ end
285
+
286
+ alias :to_hash :to_h
287
+ alias :uri :to_uri
288
+ alias :url :to_url
289
+ alias :scheme :to_scheme
290
+ alias :to_protocol :to_scheme
291
+ alias :protocol :to_scheme
292
+ alias :host :to_host
293
+ alias :base :to_base
294
+ alias :path :to_path
295
+ alias :endpoint :to_endpoint
296
+ alias :query_string :to_query_string
297
+ alias :query :to_query_string
298
+ alias :anchor :to_anchor
299
+ alias :to_fragment :to_anchor
300
+ alias :fragment :to_anchor
301
+ alias :internal_link? :relative_link?
302
+ alias :is_relative? :relative_link?
303
+ alias :is_internal? :relative_link?
304
+ alias :crawled? :crawled
305
+ end
306
+ end
data/lib/wgit/version.rb CHANGED
@@ -3,5 +3,5 @@
3
3
  # @author Michael Telford
4
4
  module Wgit
5
5
  # The current gem version of Wgit.
6
- VERSION = "0.0.10".freeze
6
+ VERSION = "0.0.11".freeze
7
7
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wgit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.10
4
+ version: 0.0.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
@@ -142,14 +142,14 @@ dependencies:
142
142
  requirements:
143
143
  - - "~>"
144
144
  - !ruby/object:Gem::Version
145
- version: '1.10'
145
+ version: 1.10.3
146
146
  type: :runtime
147
147
  prerelease: false
148
148
  version_requirements: !ruby/object:Gem::Requirement
149
149
  requirements:
150
150
  - - "~>"
151
151
  - !ruby/object:Gem::Version
152
- version: '1.10'
152
+ version: 1.10.3
153
153
  - !ruby/object:Gem::Dependency
154
154
  name: mongo
155
155
  requirement: !ruby/object:Gem::Requirement
@@ -189,6 +189,9 @@ files:
189
189
  - "./lib/wgit/url.rb"
190
190
  - "./lib/wgit/utils.rb"
191
191
  - "./lib/wgit/version.rb"
192
+ - LICENSE.txt
193
+ - README.md
194
+ - TODO.txt
192
195
  homepage: https://github.com/michaeltelford/wgit
193
196
  licenses:
194
197
  - MIT