wgit 0.10.8 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/wgit/url.rb CHANGED
@@ -28,6 +28,9 @@ module Wgit
28
28
  # The duration of the crawl for this Url (in seconds).
29
29
  attr_accessor :crawl_duration
30
30
 
31
+ # Record the redirects from the initial Url to the final Url.
32
+ attr_reader :redirects
33
+
31
34
  # Initializes a new instance of Wgit::Url which models a web based
32
35
  # HTTP URL.
33
36
  #
@@ -57,12 +60,14 @@ module Wgit
57
60
  crawled = obj.fetch('crawled', false)
58
61
  date_crawled = obj.fetch('date_crawled', nil)
59
62
  crawl_duration = obj.fetch('crawl_duration', nil)
63
+ redirects = obj.fetch('redirects', {})
60
64
  end
61
65
 
62
66
  @uri = Addressable::URI.parse(url)
63
67
  @crawled = crawled
64
68
  @date_crawled = date_crawled
65
69
  @crawl_duration = crawl_duration
70
+ @redirects = redirects || {}
66
71
 
67
72
  super(url)
68
73
  end
@@ -107,16 +112,6 @@ Addressable::URI::InvalidURIError")
107
112
  nil
108
113
  end
109
114
 
110
- # Sets the @crawled instance var, also setting @date_crawled for
111
- # convenience.
112
- #
113
- # @param bool [Boolean] True if this Url has been crawled, false otherwise.
114
- # @return [Boolean] The value of bool having been set.
115
- def crawled=(bool)
116
- @crawled = bool
117
- @date_crawled = bool ? Wgit::Utils.time_stamp : nil
118
- end
119
-
120
115
  # Overrides String#inspect to distinguish this Url from a String.
121
116
  #
122
117
  # @return [String] A short textual representation of this Url.
@@ -134,6 +129,71 @@ Addressable::URI::InvalidURIError")
134
129
  super(new_url)
135
130
  end
136
131
 
132
+ # Overrides String#concat, which would otherwise return a Wgit::Url object,
133
+ # to instead return a plain String. Self is not modified (the concat is
134
+ # applied to a String copy of self), so this method behaves like String#+,
135
+ # which is what is desired. To join two Urls, use the Wgit::Url#join method.
136
+ #
137
+ # @param other [String] The String to concat onto this one.
138
+ # @return [String] The new concatted String, not a Wgit::Url.
139
+ def concat(other)
140
+ to_s.concat(other.to_s)
141
+ end
142
+
143
+ # Sets the @crawled instance var, also setting @date_crawled for
144
+ # convenience.
145
+ #
146
+ # @param bool [Boolean] True if this Url has been crawled, false otherwise.
147
+ # @return [Boolean] The value of bool having been set.
148
+ def crawled=(bool)
149
+ @crawled = bool
150
+ @date_crawled = bool ? Wgit::Utils.time_stamp : nil
151
+ end
152
+
153
+ # Sets the @redirects instance var, mapping any Strings into Wgit::Urls.
154
+ #
155
+ # @param redirects [Hash] The redirects Hash to set for this Url.
156
+ def redirects=(redirects)
157
+ assert_type(redirects, Hash)
158
+
159
+ map_to_url = proc do |url|
160
+ Wgit::Url.new(url.to_s, crawled: @crawled, date_crawled: @date_crawled)
161
+ end
162
+
163
+ @redirects = redirects
164
+ .map { |from, to| [map_to_url.call(from), map_to_url.call(to)] }
165
+ .to_h
166
+ end
167
+
168
+ # Returns the Wgit::Url's starting with the originally requested Url to be
169
+ # crawled, followed by each redirected to Url, finishing with the final
170
+ # crawled Url e.g.
171
+ #
172
+ # Example Url redirects journey (dictated by the webserver):
173
+ #
174
+ # ```
175
+ # http://example.com => 301 to https://example.com
176
+ # https://example.com => 301 to https://example.com/
177
+ # https://example.com/ => 200 OK (no more redirects, crawl complete)
178
+ # ```
179
+ #
180
+ # Would return an Array of Wgit::Url's in the form of:
181
+ #
182
+ # ```
183
+ # %w(
184
+ # http://example.com
185
+ # https://example.com
186
+ # https://example.com/
187
+ # )
188
+ # ```
189
+ #
190
+ # @return [Array<Wgit::Url>] Each redirected to Url's finishing with the
191
+ # final (successfully) crawled Url. If no redirects took place, then just
192
+ # the originally requested Url is returned inside the Array.
193
+ def redirects_journey
194
+ [redirects.keys, self].flatten
195
+ end
196
+
137
197
  # Returns true if self is a relative Url; false if absolute.
138
198
  #
139
199
  # An absolute URL must have a scheme prefix e.g.
@@ -170,7 +230,7 @@ Addressable::URI::InvalidURIError")
170
230
  raise 'Url (self) cannot be empty' if empty?
171
231
 
172
232
  return false if scheme_relative?
173
- return true if @uri.relative?
233
+ return true if @uri.relative?
174
234
 
175
235
  # Self is absolute but may be relative to the opts param e.g. host.
176
236
  opts.select! { |_k, v| v }
@@ -226,22 +286,23 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
226
286
  !valid?
227
287
  end
228
288
 
229
- # Concats self and other together before returning a new Url. Self is not
230
- # modified.
289
+ # Joins self and other together before returning a new Url. Self is not
290
+ # modified. Some magic occurs depending on what is being joined, see
291
+ # the source code for more information.
231
292
  #
232
- # @param other [Wgit::Url, String] The other to concat to the end of self.
293
+ # @param other [Wgit::Url, String] The other (relative) Url to join to the
294
+ # end of self.
233
295
  # @return [Wgit::Url] self + separator + other, separator depends on other.
234
- def concat(other)
296
+ def join(other)
235
297
  other = Wgit::Url.new(other)
236
298
  raise 'other must be relative' unless other.relative?
237
299
 
238
300
  other = other.omit_leading_slash
239
301
  separator = %w[# ? .].include?(other[0]) ? '' : '/'
302
+ separator = '' if end_with?('/')
303
+ joined = self + separator + other
240
304
 
241
- # We use to_s below to call String#+, not Wgit::Url#+ (alias for concat).
242
- concatted = omit_trailing_slash.to_s + separator.to_s + other.to_s
243
-
244
- Wgit::Url.new(concatted)
305
+ Wgit::Url.new(joined)
245
306
  end
246
307
 
247
308
  # Normalizes/escapes self and returns a new Wgit::Url. Self isn't modified.
@@ -257,7 +318,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
257
318
  #
258
319
  # If self is absolute then it's returned as is, making this method
259
320
  # idempotent. The doc's `<base>` element is used if present, otherwise
260
- # `doc.url` is used as the base; which is concatted with self.
321
+ # `doc.url` is used as the base; which is joined with self.
261
322
  #
262
323
  # Typically used to build an absolute link obtained from a document.
263
324
  #
@@ -267,7 +328,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
267
328
  #
268
329
  # link.make_absolute(doc) # => "http://example.com/favicon.png"
269
330
  #
270
- # @param doc [Wgit::Document] The doc whose base Url is concatted with
331
+ # @param doc [Wgit::Document] The doc whose base Url is joined with
271
332
  # self.
272
333
  # @raise [StandardError] If doc isn't a Wgit::Document or if `doc.base_url`
273
334
  # raises an Exception.
@@ -279,7 +340,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
279
340
 
280
341
  return prefix_scheme(doc.url.to_scheme&.to_sym) if scheme_relative?
281
342
 
282
- absolute? ? self : doc.base_url(link: self).concat(self)
343
+ absolute? ? self : doc.base_url(link: self).join(self)
283
344
  end
284
345
 
285
346
  # Returns self having prefixed a scheme/protocol. Doesn't modify receiver.
@@ -427,7 +488,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
427
488
  return nil if path.nil? || path.empty?
428
489
  return Wgit::Url.new('/') if path == '/'
429
490
 
430
- Wgit::Url.new(path).omit_slashes
491
+ Wgit::Url.new(path).omit_leading_slash
431
492
  end
432
493
 
433
494
  # Returns the endpoint of this URL e.g. the bit after the host with any
@@ -439,7 +500,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
439
500
  # an endpoint, / is returned.
440
501
  def to_endpoint
441
502
  endpoint = @uri.path
442
- endpoint = '/' + endpoint unless endpoint.start_with?('/')
503
+ endpoint = "/#{endpoint}" unless endpoint.start_with?('/')
443
504
  Wgit::Url.new(endpoint)
444
505
  end
445
506
 
@@ -484,7 +545,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
484
545
  #
485
546
  # @return [Wgit::Url, nil] Containing just the extension string or nil.
486
547
  def to_extension
487
- path = to_path
548
+ path = to_path&.omit_trailing_slash
488
549
  return nil unless path
489
550
 
490
551
  segs = path.split('.')
@@ -530,7 +591,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
530
591
  #
531
592
  # @return [Wgit::Url] Self without a leading slash.
532
593
  def omit_leading_slash
533
- start_with?('/') ? Wgit::Url.new(self[1..-1]) : self
594
+ start_with?('/') ? Wgit::Url.new(self[1..]) : self
534
595
  end
535
596
 
536
597
  # Returns a new Wgit::Url containing self without a trailing slash. Is
@@ -564,7 +625,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
564
625
 
565
626
  return self if ['', '/'].include?(omit_base)
566
627
 
567
- Wgit::Url.new(omit_base).omit_slashes
628
+ Wgit::Url.new(omit_base).omit_leading_slash
568
629
  end
569
630
 
570
631
  # Returns a new Wgit::Url with the origin (base + port) removed e.g. Given
@@ -579,7 +640,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
579
640
 
580
641
  return self if ['', '/'].include?(omit_origin)
581
642
 
582
- Wgit::Url.new(omit_origin).omit_slashes
643
+ Wgit::Url.new(omit_origin).omit_leading_slash
583
644
  end
584
645
 
585
646
  # Returns a new Wgit::Url with the query string portion removed e.g. Given
@@ -642,32 +703,31 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
642
703
  start_with?('//')
643
704
  end
644
705
 
645
- alias + concat
646
- alias crawled? crawled
647
- alias is_relative? relative?
648
- alias is_absolute? absolute?
649
- alias is_valid? valid?
650
- alias is_query? query?
651
- alias is_fragment? fragment?
652
- alias is_index? index?
653
- alias is_scheme_relative? scheme_relative?
654
- alias uri to_uri
655
- alias url to_url
656
- alias scheme to_scheme
657
- alias host to_host
658
- alias port to_port
659
- alias domain to_domain
660
- alias brand to_brand
661
- alias base to_base
662
- alias origin to_origin
663
- alias path to_path
664
- alias endpoint to_endpoint
665
- alias query to_query
666
- alias query_hash to_query_hash
667
- alias fragment to_fragment
668
- alias extension to_extension
669
- alias user to_user
670
- alias password to_password
671
- alias sub_domain to_sub_domain
706
+ alias_method :crawled?, :crawled
707
+ alias_method :is_relative?, :relative?
708
+ alias_method :is_absolute?, :absolute?
709
+ alias_method :is_valid?, :valid?
710
+ alias_method :is_query?, :query?
711
+ alias_method :is_fragment?, :fragment?
712
+ alias_method :is_index?, :index?
713
+ alias_method :is_scheme_relative?, :scheme_relative?
714
+ alias_method :uri, :to_uri
715
+ alias_method :url, :to_url
716
+ alias_method :scheme, :to_scheme
717
+ alias_method :host, :to_host
718
+ alias_method :port, :to_port
719
+ alias_method :domain, :to_domain
720
+ alias_method :brand, :to_brand
721
+ alias_method :base, :to_base
722
+ alias_method :origin, :to_origin
723
+ alias_method :path, :to_path
724
+ alias_method :endpoint, :to_endpoint
725
+ alias_method :query, :to_query
726
+ alias_method :query_hash, :to_query_hash
727
+ alias_method :fragment, :to_fragment
728
+ alias_method :extension, :to_extension
729
+ alias_method :user, :to_user
730
+ alias_method :password, :to_password
731
+ alias_method :sub_domain, :to_sub_domain
672
732
  end
673
733
  end
data/lib/wgit/utils.rb CHANGED
@@ -23,7 +23,7 @@ module Wgit
23
23
  obj.instance_variables.each do |var|
24
24
  next if ignore.include?(var.to_s)
25
25
 
26
- key = var.to_s[1..-1] # Remove the @ prefix.
26
+ key = var.to_s[1..] # Remove the @ prefix.
27
27
  key = key.to_sym unless use_strings_as_keys
28
28
  hash[key] = obj.instance_variable_get(var)
29
29
  end
@@ -37,9 +37,9 @@ module Wgit
37
37
  # @yield [el] Gives each element (Object) of obj_or_objects if it's
38
38
  # Enumerable, otherwise obj_or_objs itself is given.
39
39
  # @return [Object] The obj_or_objs parameter is returned.
40
- def self.each(obj_or_objs)
40
+ def self.each(obj_or_objs, &block)
41
41
  if obj_or_objs.respond_to?(:each)
42
- obj_or_objs.each { |obj| yield(obj) }
42
+ obj_or_objs.each(&block)
43
43
  else
44
44
  yield(obj_or_objs)
45
45
  end
@@ -129,15 +129,13 @@ module Wgit
129
129
  # Prints out the search results in a search engine like format.
130
130
  # The format for each result looks like:
131
131
  #
132
+ # ```
132
133
  # Title
133
- #
134
134
  # Keywords (if there are some)
135
- #
136
135
  # Text Snippet (formatted to show the searched for query, if provided)
137
- #
138
136
  # URL
139
- #
140
137
  # <empty_line_separator>
138
+ # ```
141
139
  #
142
140
  # @param results [Array<Wgit::Document>] Array of Wgit::Document's which
143
141
  # each have had #search!(query) called (to update its @text with the
@@ -147,7 +145,7 @@ module Wgit
147
145
  # @param stream [#puts] Any object that respond_to?(:puts). It is used
148
146
  # to output text somewhere e.g. a file or STDERR.
149
147
  # @return [Integer] The number of results.
150
- def self.printf_search_results(results, keyword_limit: 5, stream: STDOUT)
148
+ def self.pprint_search_results(results, keyword_limit: 5, stream: $stdout)
151
149
  raise 'stream must respond_to? :puts' unless stream.respond_to?(:puts)
152
150
 
153
151
  results.each do |doc|
@@ -167,56 +165,111 @@ module Wgit
167
165
  end
168
166
 
169
167
  # Sanitises the obj to make it uniform by calling the correct sanitize_*
170
- # method for its type e.g. if obj.is_a? String then sanitize(obj). Any type
171
- # not in the case statement will be ignored and returned as is.
168
+ # method for its type e.g. if obj.is_a? String then sanitize_str(obj) is called.
169
+ # Any type not in the case statement will be ignored and returned as is.
170
+ # Call this method if unsure what obj's type is.
172
171
  #
173
172
  # @param obj [Object] The object to be sanitized.
174
173
  # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
175
174
  # invalid characters.
176
- # @return [Object] The sanitized obj is both modified and then returned.
175
+ # @return [Object] The sanitized obj.
177
176
  def self.sanitize(obj, encode: true)
178
177
  case obj
178
+ when Wgit::Url
179
+ sanitize_url(obj, encode:)
179
180
  when String
180
- sanitize_str(obj, encode: encode)
181
+ sanitize_str(obj, encode:)
181
182
  when Array
182
- sanitize_arr(obj, encode: encode)
183
+ sanitize_arr(obj, encode:)
183
184
  else
184
185
  obj
185
186
  end
186
187
  end
187
188
 
189
+ # Sanitises a Wgit::Url to make it uniform. First sanitizes the Url as a
190
+ # String before replacing the Url value with the sanitized version. This
191
+ # method therefore modifies the given url param and also returns it.
192
+ #
193
+ # @param url [Wgit::Url] The Wgit::Url to sanitize. url is modified.
194
+ # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
195
+ # invalid characters.
196
+ # @return [Wgit::Url] The sanitized url, which is also modified.
197
+ def self.sanitize_url(url, encode: true)
198
+ str = sanitize_str(url.to_s, encode:)
199
+ url.replace(str)
200
+ end
201
+
188
202
  # Sanitises a String to make it uniform. Strips any leading/trailing white
189
203
  # space. Also applies UTF-8 encoding (replacing invalid characters) if
190
204
  # `encode: true`.
191
205
  #
192
- # @param str [String] The String to sanitize. str is modified.
206
+ # @param str [String] The String to sanitize. str is not modified.
193
207
  # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
194
208
  # invalid characters.
195
- # @return [String] The sanitized str is both modified and then returned.
209
+ # @return [String] The sanitized str.
196
210
  def self.sanitize_str(str, encode: true)
197
- if str.is_a?(String)
198
- str.encode!('UTF-8', undef: :replace, invalid: :replace) if encode
199
- str.strip!
200
- end
211
+ return str unless str.is_a?(String)
201
212
 
202
- str
213
+ str = str.encode('UTF-8', undef: :replace, invalid: :replace) if encode
214
+ str.strip
203
215
  end
204
216
 
205
217
  # Sanitises an Array to make it uniform. Removes empty Strings and nils,
206
218
  # processes non empty Strings using Wgit::Utils.sanitize and removes
207
219
  # duplicates.
208
220
  #
209
- # @param arr [Enumerable] The Array to sanitize. arr is modified.
210
- # @return [Enumerable] The sanitized arr is both modified and then returned.
221
+ # @param arr [Enumerable] The Array to sanitize. arr is not modified.
222
+ # @return [Enumerable] The sanitized arr.
211
223
  def self.sanitize_arr(arr, encode: true)
212
- if arr.is_a?(Array)
213
- arr.map! { |str| sanitize(str, encode: encode) }
214
- arr.reject! { |str| str.is_a?(String) ? str.empty? : false }
215
- arr.compact!
216
- arr.uniq!
217
- end
224
+ return arr unless arr.is_a?(Array)
218
225
 
219
226
  arr
227
+ .map { |str| sanitize(str, encode:) }
228
+ .reject { |str| str.is_a?(String) && str.empty? }
229
+ .compact
230
+ .uniq
231
+ end
232
+
233
+ # Pretty prints a log statement, used for debugging purposes.
234
+ #
235
+ # Use like:
236
+ #
237
+ # ```
238
+ # Wgit::Utils.pprint 1, include_html: include_html, ignore: ignore_vars
239
+ # ```
240
+ #
241
+ # Which produces a log like:
242
+ #
243
+ # ```
244
+ # DEBUG_1 - include_html: true | ignore: ['@html', '@parser']
245
+ # ```
246
+ #
247
+ # @param identifier [#to_s] A log identifier e.g. "START" or 1 etc.
248
+ # @param stream [#puts, #print] Any object that respond_to? :puts and :print. It is
249
+ # used to output the log text somewhere e.g. a file or STDERR.
250
+ # @param prefix [String] The log prefix, useful for visibility/grepping.
251
+ # @param new_line [Boolean] Whether or not to use a new line (\n) as the
252
+ # separator.
253
+ # @param vars [Hash<#inspect, #inspect>] The vars to inspect in the log.
254
+ def self.pprint(identifier, stream: $stdout, prefix: 'DEBUG', new_line: false, **vars)
255
+ sep1 = new_line ? "\n" : ' - '
256
+ sep2 = new_line ? "\n" : ' | '
257
+
258
+ stream.print "\n#{prefix}_#{identifier}#{sep1}"
259
+
260
+ vars.each_with_index do |arr, i|
261
+ last_item = (i + 1) == vars.size
262
+ sep3 = sep2
263
+ sep3 = new_line ? "\n" : '' if last_item
264
+ k, v = arr
265
+
266
+ stream.print "#{k}: #{v}#{sep3}"
267
+ end
268
+
269
+ stream.puts "\n"
270
+ stream.puts "\n" unless new_line
271
+
272
+ nil
220
273
  end
221
274
  end
222
275
  end
data/lib/wgit/version.rb CHANGED
@@ -6,7 +6,7 @@
6
6
  # @author Michael Telford
7
7
  module Wgit
8
8
  # The current gem version of Wgit.
9
- VERSION = '0.10.8'
9
+ VERSION = '0.11.0'
10
10
 
11
11
  # Returns the current gem version of Wgit as a String.
12
12
  def self.version
data/lib/wgit.rb CHANGED
@@ -10,6 +10,7 @@ require_relative 'wgit/document_extractors'
10
10
  require_relative 'wgit/crawler'
11
11
  require_relative 'wgit/database/model'
12
12
  require_relative 'wgit/database/database'
13
+ require_relative 'wgit/robots_parser'
13
14
  require_relative 'wgit/indexer'
14
15
  require_relative 'wgit/dsl'
15
16
  require_relative 'wgit/base'