wgit 0.10.8 → 0.11.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/wgit/url.rb CHANGED
@@ -28,6 +28,9 @@ module Wgit
28
28
  # The duration of the crawl for this Url (in seconds).
29
29
  attr_accessor :crawl_duration
30
30
 
31
+ # Record the redirects from the initial Url to the final Url.
32
+ attr_reader :redirects
33
+
31
34
  # Initializes a new instance of Wgit::Url which models a web based
32
35
  # HTTP URL.
33
36
  #
@@ -57,12 +60,14 @@ module Wgit
57
60
  crawled = obj.fetch('crawled', false)
58
61
  date_crawled = obj.fetch('date_crawled', nil)
59
62
  crawl_duration = obj.fetch('crawl_duration', nil)
63
+ redirects = obj.fetch('redirects', {})
60
64
  end
61
65
 
62
66
  @uri = Addressable::URI.parse(url)
63
67
  @crawled = crawled
64
68
  @date_crawled = date_crawled
65
69
  @crawl_duration = crawl_duration
70
+ @redirects = redirects || {}
66
71
 
67
72
  super(url)
68
73
  end
@@ -107,16 +112,6 @@ Addressable::URI::InvalidURIError")
107
112
  nil
108
113
  end
109
114
 
110
- # Sets the @crawled instance var, also setting @date_crawled for
111
- # convenience.
112
- #
113
- # @param bool [Boolean] True if this Url has been crawled, false otherwise.
114
- # @return [Boolean] The value of bool having been set.
115
- def crawled=(bool)
116
- @crawled = bool
117
- @date_crawled = bool ? Wgit::Utils.time_stamp : nil
118
- end
119
-
120
115
  # Overrides String#inspect to distinguish this Url from a String.
121
116
  #
122
117
  # @return [String] A short textual representation of this Url.
@@ -134,6 +129,71 @@ Addressable::URI::InvalidURIError")
134
129
  super(new_url)
135
130
  end
136
131
 
132
+ # Overrides String#concat which oddly returns a Wgit::Url object, and
133
+ # instead returns a String. Therefore this method works the same as if
134
+ # you call String#concat, or its alias String#+, which is desired for
135
+ # this method. If you want to join two Urls, use Wgit::Url#join method.
136
+ #
137
+ # @param other [String] The String to concat onto this one.
138
+ # @return [String] The new concatted String, not a Wgit::Url.
139
+ def concat(other)
140
+ to_s.concat(other.to_s)
141
+ end
142
+
143
+ # Sets the @crawled instance var, also setting @date_crawled for
144
+ # convenience.
145
+ #
146
+ # @param bool [Boolean] True if this Url has been crawled, false otherwise.
147
+ # @return [Boolean] The value of bool having been set.
148
+ def crawled=(bool)
149
+ @crawled = bool
150
+ @date_crawled = bool ? Wgit::Utils.time_stamp : nil
151
+ end
152
+
153
+ # Sets the @redirects instance var, mapping any Strings into Wgit::Urls.
154
+ #
155
+ # @param redirects [Hash] The redirects Hash to set for this Url.
156
+ def redirects=(redirects)
157
+ assert_type(redirects, Hash)
158
+
159
+ map_to_url = proc do |url|
160
+ Wgit::Url.new(url.to_s, crawled: @crawled, date_crawled: @date_crawled)
161
+ end
162
+
163
+ @redirects = redirects
164
+ .map { |from, to| [map_to_url.call(from), map_to_url.call(to)] }
165
+ .to_h
166
+ end
167
+
168
+ # Returns the Wgit::Url's starting with the originally requested Url to be
169
+ # crawled, followed by each redirected to Url, finishing with the final
170
+ # crawled Url e.g.
171
+ #
172
+ # Example Url redirects journey (dictated by the webserver):
173
+ #
174
+ # ```
175
+ # http://example.com => 301 to https://example.com
176
+ # https://example.com => 301 to https://example.com/
177
+ # https://example.com/ => 200 OK (no more redirects, crawl complete)
178
+ # ```
179
+ #
180
+ # Would return an Array of Wgit::Url's in the form of:
181
+ #
182
+ # ```
183
+ # %w(
184
+ # http://example.com
185
+ # https://example.com
186
+ # https://example.com/
187
+ # )
188
+ # ```
189
+ #
190
+ # @return [Array<Wgit::Url>] Each redirected to Url's finishing with the
191
+ # final (successfully) crawled Url. If no redirects took place, then just
192
+ # the originally requested Url is returned inside the Array.
193
+ def redirects_journey
194
+ [redirects.keys, self].flatten
195
+ end
196
+
137
197
  # Returns true if self is a relative Url; false if absolute.
138
198
  #
139
199
  # An absolute URL must have a scheme prefix e.g.
@@ -170,7 +230,7 @@ Addressable::URI::InvalidURIError")
170
230
  raise 'Url (self) cannot be empty' if empty?
171
231
 
172
232
  return false if scheme_relative?
173
- return true if @uri.relative?
233
+ return true if @uri.relative?
174
234
 
175
235
  # Self is absolute but may be relative to the opts param e.g. host.
176
236
  opts.select! { |_k, v| v }
@@ -226,22 +286,23 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
226
286
  !valid?
227
287
  end
228
288
 
229
- # Concats self and other together before returning a new Url. Self is not
230
- # modified.
289
+ # Joins self and other together before returning a new Url. Self is not
290
+ # modified. Some magic occurs depending on what is being joined, see
291
+ # the source code for more information.
231
292
  #
232
- # @param other [Wgit::Url, String] The other to concat to the end of self.
293
+ # @param other [Wgit::Url, String] The other (relative) Url to join to the
294
+ # end of self.
233
295
  # @return [Wgit::Url] self + separator + other, separator depends on other.
234
- def concat(other)
296
+ def join(other)
235
297
  other = Wgit::Url.new(other)
236
298
  raise 'other must be relative' unless other.relative?
237
299
 
238
300
  other = other.omit_leading_slash
239
301
  separator = %w[# ? .].include?(other[0]) ? '' : '/'
302
+ separator = '' if end_with?('/')
303
+ joined = self + separator + other
240
304
 
241
- # We use to_s below to call String#+, not Wgit::Url#+ (alias for concat).
242
- concatted = omit_trailing_slash.to_s + separator.to_s + other.to_s
243
-
244
- Wgit::Url.new(concatted)
305
+ Wgit::Url.new(joined)
245
306
  end
246
307
 
247
308
  # Normalizes/escapes self and returns a new Wgit::Url. Self isn't modified.
@@ -257,7 +318,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
257
318
  #
258
319
  # If self is absolute then it's returned as is, making this method
259
320
  # idempotent. The doc's `<base>` element is used if present, otherwise
260
- # `doc.url` is used as the base; which is concatted with self.
321
+ # `doc.url` is used as the base; which is joined with self.
261
322
  #
262
323
  # Typically used to build an absolute link obtained from a document.
263
324
  #
@@ -267,7 +328,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
267
328
  #
268
329
  # link.make_absolute(doc) # => "http://example.com/favicon.png"
269
330
  #
270
- # @param doc [Wgit::Document] The doc whose base Url is concatted with
331
+ # @param doc [Wgit::Document] The doc whose base Url is joined with
271
332
  # self.
272
333
  # @raise [StandardError] If doc isn't a Wgit::Document or if `doc.base_url`
273
334
  # raises an Exception.
@@ -279,7 +340,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
279
340
 
280
341
  return prefix_scheme(doc.url.to_scheme&.to_sym) if scheme_relative?
281
342
 
282
- absolute? ? self : doc.base_url(link: self).concat(self)
343
+ absolute? ? self : doc.base_url(link: self).join(self)
283
344
  end
284
345
 
285
346
  # Returns self having prefixed a scheme/protocol. Doesn't modify receiver.
@@ -427,7 +488,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
427
488
  return nil if path.nil? || path.empty?
428
489
  return Wgit::Url.new('/') if path == '/'
429
490
 
430
- Wgit::Url.new(path).omit_slashes
491
+ Wgit::Url.new(path).omit_leading_slash
431
492
  end
432
493
 
433
494
  # Returns the endpoint of this URL e.g. the bit after the host with any
@@ -439,7 +500,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
439
500
  # an endpoint, / is returned.
440
501
  def to_endpoint
441
502
  endpoint = @uri.path
442
- endpoint = '/' + endpoint unless endpoint.start_with?('/')
503
+ endpoint = "/#{endpoint}" unless endpoint.start_with?('/')
443
504
  Wgit::Url.new(endpoint)
444
505
  end
445
506
 
@@ -484,7 +545,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
484
545
  #
485
546
  # @return [Wgit::Url, nil] Containing just the extension string or nil.
486
547
  def to_extension
487
- path = to_path
548
+ path = to_path&.omit_trailing_slash
488
549
  return nil unless path
489
550
 
490
551
  segs = path.split('.')
@@ -530,7 +591,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
530
591
  #
531
592
  # @return [Wgit::Url] Self without a leading slash.
532
593
  def omit_leading_slash
533
- start_with?('/') ? Wgit::Url.new(self[1..-1]) : self
594
+ start_with?('/') ? Wgit::Url.new(self[1..]) : self
534
595
  end
535
596
 
536
597
  # Returns a new Wgit::Url containing self without a trailing slash. Is
@@ -564,7 +625,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
564
625
 
565
626
  return self if ['', '/'].include?(omit_base)
566
627
 
567
- Wgit::Url.new(omit_base).omit_slashes
628
+ Wgit::Url.new(omit_base).omit_leading_slash
568
629
  end
569
630
 
570
631
  # Returns a new Wgit::Url with the origin (base + port) removed e.g. Given
@@ -579,7 +640,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
579
640
 
580
641
  return self if ['', '/'].include?(omit_origin)
581
642
 
582
- Wgit::Url.new(omit_origin).omit_slashes
643
+ Wgit::Url.new(omit_origin).omit_leading_slash
583
644
  end
584
645
 
585
646
  # Returns a new Wgit::Url with the query string portion removed e.g. Given
@@ -642,32 +703,31 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
642
703
  start_with?('//')
643
704
  end
644
705
 
645
- alias + concat
646
- alias crawled? crawled
647
- alias is_relative? relative?
648
- alias is_absolute? absolute?
649
- alias is_valid? valid?
650
- alias is_query? query?
651
- alias is_fragment? fragment?
652
- alias is_index? index?
653
- alias is_scheme_relative? scheme_relative?
654
- alias uri to_uri
655
- alias url to_url
656
- alias scheme to_scheme
657
- alias host to_host
658
- alias port to_port
659
- alias domain to_domain
660
- alias brand to_brand
661
- alias base to_base
662
- alias origin to_origin
663
- alias path to_path
664
- alias endpoint to_endpoint
665
- alias query to_query
666
- alias query_hash to_query_hash
667
- alias fragment to_fragment
668
- alias extension to_extension
669
- alias user to_user
670
- alias password to_password
671
- alias sub_domain to_sub_domain
706
+ alias_method :crawled?, :crawled
707
+ alias_method :is_relative?, :relative?
708
+ alias_method :is_absolute?, :absolute?
709
+ alias_method :is_valid?, :valid?
710
+ alias_method :is_query?, :query?
711
+ alias_method :is_fragment?, :fragment?
712
+ alias_method :is_index?, :index?
713
+ alias_method :is_scheme_relative?, :scheme_relative?
714
+ alias_method :uri, :to_uri
715
+ alias_method :url, :to_url
716
+ alias_method :scheme, :to_scheme
717
+ alias_method :host, :to_host
718
+ alias_method :port, :to_port
719
+ alias_method :domain, :to_domain
720
+ alias_method :brand, :to_brand
721
+ alias_method :base, :to_base
722
+ alias_method :origin, :to_origin
723
+ alias_method :path, :to_path
724
+ alias_method :endpoint, :to_endpoint
725
+ alias_method :query, :to_query
726
+ alias_method :query_hash, :to_query_hash
727
+ alias_method :fragment, :to_fragment
728
+ alias_method :extension, :to_extension
729
+ alias_method :user, :to_user
730
+ alias_method :password, :to_password
731
+ alias_method :sub_domain, :to_sub_domain
672
732
  end
673
733
  end
data/lib/wgit/utils.rb CHANGED
@@ -23,7 +23,7 @@ module Wgit
23
23
  obj.instance_variables.each do |var|
24
24
  next if ignore.include?(var.to_s)
25
25
 
26
- key = var.to_s[1..-1] # Remove the @ prefix.
26
+ key = var.to_s[1..] # Remove the @ prefix.
27
27
  key = key.to_sym unless use_strings_as_keys
28
28
  hash[key] = obj.instance_variable_get(var)
29
29
  end
@@ -37,9 +37,9 @@ module Wgit
37
37
  # @yield [el] Gives each element (Object) of obj_or_objects if it's
38
38
  # Enumerable, otherwise obj_or_objs itself is given.
39
39
  # @return [Object] The obj_or_objs parameter is returned.
40
- def self.each(obj_or_objs)
40
+ def self.each(obj_or_objs, &block)
41
41
  if obj_or_objs.respond_to?(:each)
42
- obj_or_objs.each { |obj| yield(obj) }
42
+ obj_or_objs.each(&block)
43
43
  else
44
44
  yield(obj_or_objs)
45
45
  end
@@ -129,15 +129,13 @@ module Wgit
129
129
  # Prints out the search results in a search engine like format.
130
130
  # The format for each result looks like:
131
131
  #
132
+ # ```
132
133
  # Title
133
- #
134
134
  # Keywords (if there are some)
135
- #
136
135
  # Text Snippet (formatted to show the searched for query, if provided)
137
- #
138
136
  # URL
139
- #
140
137
  # <empty_line_separator>
138
+ # ```
141
139
  #
142
140
  # @param results [Array<Wgit::Document>] Array of Wgit::Document's which
143
141
  # each have had #search!(query) called (to update its @text with the
@@ -147,7 +145,7 @@ module Wgit
147
145
  # @param stream [#puts] Any object that respond_to?(:puts). It is used
148
146
  # to output text somewhere e.g. a file or STDERR.
149
147
  # @return [Integer] The number of results.
150
- def self.printf_search_results(results, keyword_limit: 5, stream: STDOUT)
148
+ def self.pprint_search_results(results, keyword_limit: 5, stream: $stdout)
151
149
  raise 'stream must respond_to? :puts' unless stream.respond_to?(:puts)
152
150
 
153
151
  results.each do |doc|
@@ -167,56 +165,111 @@ module Wgit
167
165
  end
168
166
 
169
167
  # Sanitises the obj to make it uniform by calling the correct sanitize_*
170
- # method for its type e.g. if obj.is_a? String then sanitize(obj). Any type
171
- # not in the case statement will be ignored and returned as is.
168
+ # method for its type e.g. if obj.is_a? String then sanitize_str(obj) is called.
169
+ # Any type not in the case statement will be ignored and returned as is.
170
+ # Call this method if unsure what obj's type is.
172
171
  #
173
172
  # @param obj [Object] The object to be sanitized.
174
173
  # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
175
174
  # invalid characters.
176
- # @return [Object] The sanitized obj is both modified and then returned.
175
+ # @return [Object] The sanitized obj.
177
176
  def self.sanitize(obj, encode: true)
178
177
  case obj
178
+ when Wgit::Url
179
+ sanitize_url(obj, encode:)
179
180
  when String
180
- sanitize_str(obj, encode: encode)
181
+ sanitize_str(obj, encode:)
181
182
  when Array
182
- sanitize_arr(obj, encode: encode)
183
+ sanitize_arr(obj, encode:)
183
184
  else
184
185
  obj
185
186
  end
186
187
  end
187
188
 
189
+ # Sanitises a Wgit::Url to make it uniform. First sanitizes the Url as a
190
+ # String before replacing the Url value with the sanitized version. This
191
+ # method therefore modifies the given url param and also returns it.
192
+ #
193
+ # @param url [Wgit::Url] The Wgit::Url to sanitize. url is modified.
194
+ # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
195
+ # invalid characters.
196
+ # @return [Wgit::Url] The sanitized url, which is also modified.
197
+ def self.sanitize_url(url, encode: true)
198
+ str = sanitize_str(url.to_s, encode:)
199
+ url.replace(str)
200
+ end
201
+
188
202
  # Sanitises a String to make it uniform. Strips any leading/trailing white
189
203
  # space. Also applies UTF-8 encoding (replacing invalid characters) if
190
204
  # `encode: true`.
191
205
  #
192
- # @param str [String] The String to sanitize. str is modified.
206
+ # @param str [String] The String to sanitize. str is not modified.
193
207
  # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
194
208
  # invalid characters.
195
- # @return [String] The sanitized str is both modified and then returned.
209
+ # @return [String] The sanitized str.
196
210
  def self.sanitize_str(str, encode: true)
197
- if str.is_a?(String)
198
- str.encode!('UTF-8', undef: :replace, invalid: :replace) if encode
199
- str.strip!
200
- end
211
+ return str unless str.is_a?(String)
201
212
 
202
- str
213
+ str = str.encode('UTF-8', undef: :replace, invalid: :replace) if encode
214
+ str.strip
203
215
  end
204
216
 
205
217
  # Sanitises an Array to make it uniform. Removes empty Strings and nils,
206
218
  # processes non empty Strings using Wgit::Utils.sanitize and removes
207
219
  # duplicates.
208
220
  #
209
- # @param arr [Enumerable] The Array to sanitize. arr is modified.
210
- # @return [Enumerable] The sanitized arr is both modified and then returned.
221
+ # @param arr [Enumerable] The Array to sanitize. arr is not modified.
222
+ # @return [Enumerable] The sanitized arr.
211
223
  def self.sanitize_arr(arr, encode: true)
212
- if arr.is_a?(Array)
213
- arr.map! { |str| sanitize(str, encode: encode) }
214
- arr.reject! { |str| str.is_a?(String) ? str.empty? : false }
215
- arr.compact!
216
- arr.uniq!
217
- end
224
+ return arr unless arr.is_a?(Array)
218
225
 
219
226
  arr
227
+ .map { |str| sanitize(str, encode:) }
228
+ .reject { |str| str.is_a?(String) && str.empty? }
229
+ .compact
230
+ .uniq
231
+ end
232
+
233
+ # Pretty prints a log statement, used for debugging purposes.
234
+ #
235
+ # Use like:
236
+ #
237
+ # ```
238
+ # Wgit::Utils.pprint 1, include_html: include_html, ignore: ignore_vars
239
+ # ```
240
+ #
241
+ # Which produces a log like:
242
+ #
243
+ # ```
244
+ # DEBUG_1 - include_html: true | ignore: ['@html', '@parser']
245
+ # ```
246
+ #
247
+ # @param identifier [#to_s] A log identifier e.g. "START" or 1 etc.
248
+ # @param stream [#puts] Any object that respond_to? :puts and :print. It is
249
+ # used to output the log text somewhere e.g. a file or STDERR.
250
+ # @param prefix [String] The log prefix, useful for visibility/greping.
251
+ # @param new_line [Boolean] Whether or not to use a new line (\n) as the
252
+ # separator.
253
+ # @param vars [Hash<#inspect, #inspect>] The vars to inspect in the log.
254
+ def self.pprint(identifier, stream: $stdout, prefix: 'DEBUG', new_line: false, **vars)
255
+ sep1 = new_line ? "\n" : ' - '
256
+ sep2 = new_line ? "\n" : ' | '
257
+
258
+ stream.print "\n#{prefix}_#{identifier}#{sep1}"
259
+
260
+ vars.each_with_index do |arr, i|
261
+ last_item = (i + 1) == vars.size
262
+ sep3 = sep2
263
+ sep3 = new_line ? "\n" : '' if last_item
264
+ k, v = arr
265
+
266
+ stream.print "#{k}: #{v}#{sep3}"
267
+ end
268
+
269
+ stream.puts "\n"
270
+ stream.puts "\n" unless new_line
271
+
272
+ nil
220
273
  end
221
274
  end
222
275
  end
data/lib/wgit/version.rb CHANGED
@@ -6,7 +6,7 @@
6
6
  # @author Michael Telford
7
7
  module Wgit
8
8
  # The current gem version of Wgit.
9
- VERSION = '0.10.8'
9
+ VERSION = '0.11.0'
10
10
 
11
11
  # Returns the current gem version of Wgit as a String.
12
12
  def self.version
data/lib/wgit.rb CHANGED
@@ -10,6 +10,7 @@ require_relative 'wgit/document_extractors'
10
10
  require_relative 'wgit/crawler'
11
11
  require_relative 'wgit/database/model'
12
12
  require_relative 'wgit/database/database'
13
+ require_relative 'wgit/robots_parser'
13
14
  require_relative 'wgit/indexer'
14
15
  require_relative 'wgit/dsl'
15
16
  require_relative 'wgit/base'