wgit 0.10.7 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/wgit/url.rb CHANGED
@@ -28,6 +28,9 @@ module Wgit
28
28
  # The duration of the crawl for this Url (in seconds).
29
29
  attr_accessor :crawl_duration
30
30
 
31
+ # Record the redirects from the initial Url to the final Url.
32
+ attr_reader :redirects
33
+
31
34
  # Initializes a new instance of Wgit::Url which models a web based
32
35
  # HTTP URL.
33
36
  #
@@ -57,12 +60,14 @@ module Wgit
57
60
  crawled = obj.fetch('crawled', false)
58
61
  date_crawled = obj.fetch('date_crawled', nil)
59
62
  crawl_duration = obj.fetch('crawl_duration', nil)
63
+ redirects = obj.fetch('redirects', {})
60
64
  end
61
65
 
62
66
  @uri = Addressable::URI.parse(url)
63
67
  @crawled = crawled
64
68
  @date_crawled = date_crawled
65
69
  @crawl_duration = crawl_duration
70
+ @redirects = redirects || {}
66
71
 
67
72
  super(url)
68
73
  end
@@ -107,6 +112,34 @@ Addressable::URI::InvalidURIError")
107
112
  nil
108
113
  end
109
114
 
115
+ # Overrides String#inspect to distinguish this Url from a String.
116
+ #
117
+ # @return [String] A short textual representation of this Url.
118
+ def inspect
119
+ "#<Wgit::Url url=\"#{self}\" crawled=#{@crawled}>"
120
+ end
121
+
122
+ # Overrides String#replace setting the new_url @uri and String value.
123
+ #
124
+ # @param new_url [Wgit::Url, String] The new URL value.
125
+ # @return [String] The new URL value once set.
126
+ def replace(new_url)
127
+ @uri = Addressable::URI.parse(new_url)
128
+
129
+ super(new_url)
130
+ end
131
+
132
+ # Overrides String#concat which oddly returns a Wgit::Url object, and
133
+ # instead returns a String. Therefore this method works the same as if
134
+ # you call String#concat, or its alias String#+, which is desired for
135
+ # this method. If you want to join two Urls, use Wgit::Url#join method.
136
+ #
137
+ # @param other [String] The String to concat onto this one.
138
+ # @return [String] The new concatted String, not a Wgit::Url.
139
+ def concat(other)
140
+ to_s.concat(other.to_s)
141
+ end
142
+
110
143
  # Sets the @crawled instance var, also setting @date_crawled for
111
144
  # convenience.
112
145
  #
@@ -117,14 +150,48 @@ Addressable::URI::InvalidURIError")
117
150
  @date_crawled = bool ? Wgit::Utils.time_stamp : nil
118
151
  end
119
152
 
120
- # Overrides String#replace setting the new_url @uri and String value.
153
+ # Sets the @redirects instance var, mapping any Strings into Wgit::Urls.
121
154
  #
122
- # @param new_url [Wgit::Url, String] The new URL value.
123
- # @return [String] The new URL value once set.
124
- def replace(new_url)
125
- @uri = Addressable::URI.parse(new_url)
155
+ # @param redirects [Hash] The redirects Hash to set for this Url.
156
+ def redirects=(redirects)
157
+ assert_type(redirects, Hash)
126
158
 
127
- super(new_url)
159
+ map_to_url = proc do |url|
160
+ Wgit::Url.new(url.to_s, crawled: @crawled, date_crawled: @date_crawled)
161
+ end
162
+
163
+ @redirects = redirects
164
+ .map { |from, to| [map_to_url.call(from), map_to_url.call(to)] }
165
+ .to_h
166
+ end
167
+
168
+ # Returns the Wgit::Url's starting with the originally requested Url to be
169
+ # crawled, followed by each redirected to Url, finishing with the final
170
+ # crawled Url e.g.
171
+ #
172
+ # Example Url redirects journey (dictated by the webserver):
173
+ #
174
+ # ```
175
+ # http://example.com => 301 to https://example.com
176
+ # https://example.com => 301 to https://example.com/
177
+ # https://example.com/ => 200 OK (no more redirects, crawl complete)
178
+ # ```
179
+ #
180
+ # Would return an Array of Wgit::Url's in the form of:
181
+ #
182
+ # ```
183
+ # %w(
184
+ # http://example.com
185
+ # https://example.com
186
+ # https://example.com/
187
+ # )
188
+ # ```
189
+ #
190
+ # @return [Array<Wgit::Url>] Each redirected to Url's finishing with the
191
+ # final (successfully) crawled Url. If no redirects took place, then just
192
+ # the originally requested Url is returned inside the Array.
193
+ def redirects_journey
194
+ [redirects.keys, self].flatten
128
195
  end
129
196
 
130
197
  # Returns true if self is a relative Url; false if absolute.
@@ -163,7 +230,7 @@ Addressable::URI::InvalidURIError")
163
230
  raise 'Url (self) cannot be empty' if empty?
164
231
 
165
232
  return false if scheme_relative?
166
- return true if @uri.relative?
233
+ return true if @uri.relative?
167
234
 
168
235
  # Self is absolute but may be relative to the opts param e.g. host.
169
236
  opts.select! { |_k, v| v }
@@ -219,22 +286,23 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
219
286
  !valid?
220
287
  end
221
288
 
222
- # Concats self and other together before returning a new Url. Self is not
223
- # modified.
289
+ # Joins self and other together before returning a new Url. Self is not
290
+ # modified. Some magic occurs depending on what is being joined, see
291
+ # the source code for more information.
224
292
  #
225
- # @param other [Wgit::Url, String] The other to concat to the end of self.
293
+ # @param other [Wgit::Url, String] The other (relative) Url to join to the
294
+ # end of self.
226
295
  # @return [Wgit::Url] self + separator + other, separator depends on other.
227
- def concat(other)
296
+ def join(other)
228
297
  other = Wgit::Url.new(other)
229
298
  raise 'other must be relative' unless other.relative?
230
299
 
231
300
  other = other.omit_leading_slash
232
301
  separator = %w[# ? .].include?(other[0]) ? '' : '/'
302
+ separator = '' if end_with?('/')
303
+ joined = self + separator + other
233
304
 
234
- # We use to_s below to call String#+, not Wgit::Url#+ (alias for concat).
235
- concatted = omit_trailing_slash.to_s + separator.to_s + other.to_s
236
-
237
- Wgit::Url.new(concatted)
305
+ Wgit::Url.new(joined)
238
306
  end
239
307
 
240
308
  # Normalizes/escapes self and returns a new Wgit::Url. Self isn't modified.
@@ -250,7 +318,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
250
318
  #
251
319
  # If self is absolute then it's returned as is, making this method
252
320
  # idempotent. The doc's `<base>` element is used if present, otherwise
253
- # `doc.url` is used as the base; which is concatted with self.
321
+ # `doc.url` is used as the base; which is joined with self.
254
322
  #
255
323
  # Typically used to build an absolute link obtained from a document.
256
324
  #
@@ -260,7 +328,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
260
328
  #
261
329
  # link.make_absolute(doc) # => "http://example.com/favicon.png"
262
330
  #
263
- # @param doc [Wgit::Document] The doc whose base Url is concatted with
331
+ # @param doc [Wgit::Document] The doc whose base Url is joined with
264
332
  # self.
265
333
  # @raise [StandardError] If doc isn't a Wgit::Document or if `doc.base_url`
266
334
  # raises an Exception.
@@ -272,7 +340,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
272
340
 
273
341
  return prefix_scheme(doc.url.to_scheme&.to_sym) if scheme_relative?
274
342
 
275
- absolute? ? self : doc.base_url(link: self).concat(self)
343
+ absolute? ? self : doc.base_url(link: self).join(self)
276
344
  end
277
345
 
278
346
  # Returns self having prefixed a scheme/protocol. Doesn't modify receiver.
@@ -420,7 +488,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
420
488
  return nil if path.nil? || path.empty?
421
489
  return Wgit::Url.new('/') if path == '/'
422
490
 
423
- Wgit::Url.new(path).omit_slashes
491
+ Wgit::Url.new(path).omit_leading_slash
424
492
  end
425
493
 
426
494
  # Returns the endpoint of this URL e.g. the bit after the host with any
@@ -432,7 +500,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
432
500
  # an endpoint, / is returned.
433
501
  def to_endpoint
434
502
  endpoint = @uri.path
435
- endpoint = '/' + endpoint unless endpoint.start_with?('/')
503
+ endpoint = "/#{endpoint}" unless endpoint.start_with?('/')
436
504
  Wgit::Url.new(endpoint)
437
505
  end
438
506
 
@@ -477,7 +545,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
477
545
  #
478
546
  # @return [Wgit::Url, nil] Containing just the extension string or nil.
479
547
  def to_extension
480
- path = to_path
548
+ path = to_path&.omit_trailing_slash
481
549
  return nil unless path
482
550
 
483
551
  segs = path.split('.')
@@ -523,7 +591,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
523
591
  #
524
592
  # @return [Wgit::Url] Self without a leading slash.
525
593
  def omit_leading_slash
526
- start_with?('/') ? Wgit::Url.new(self[1..-1]) : self
594
+ start_with?('/') ? Wgit::Url.new(self[1..]) : self
527
595
  end
528
596
 
529
597
  # Returns a new Wgit::Url containing self without a trailing slash. Is
@@ -557,7 +625,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
557
625
 
558
626
  return self if ['', '/'].include?(omit_base)
559
627
 
560
- Wgit::Url.new(omit_base).omit_slashes
628
+ Wgit::Url.new(omit_base).omit_leading_slash
561
629
  end
562
630
 
563
631
  # Returns a new Wgit::Url with the origin (base + port) removed e.g. Given
@@ -572,7 +640,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
572
640
 
573
641
  return self if ['', '/'].include?(omit_origin)
574
642
 
575
- Wgit::Url.new(omit_origin).omit_slashes
643
+ Wgit::Url.new(omit_origin).omit_leading_slash
576
644
  end
577
645
 
578
646
  # Returns a new Wgit::Url with the query string portion removed e.g. Given
@@ -635,32 +703,31 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
635
703
  start_with?('//')
636
704
  end
637
705
 
638
- alias + concat
639
- alias crawled? crawled
640
- alias is_relative? relative?
641
- alias is_absolute? absolute?
642
- alias is_valid? valid?
643
- alias is_query? query?
644
- alias is_fragment? fragment?
645
- alias is_index? index?
646
- alias is_scheme_relative? scheme_relative?
647
- alias uri to_uri
648
- alias url to_url
649
- alias scheme to_scheme
650
- alias host to_host
651
- alias port to_port
652
- alias domain to_domain
653
- alias brand to_brand
654
- alias base to_base
655
- alias origin to_origin
656
- alias path to_path
657
- alias endpoint to_endpoint
658
- alias query to_query
659
- alias query_hash to_query_hash
660
- alias fragment to_fragment
661
- alias extension to_extension
662
- alias user to_user
663
- alias password to_password
664
- alias sub_domain to_sub_domain
706
+ alias_method :crawled?, :crawled
707
+ alias_method :is_relative?, :relative?
708
+ alias_method :is_absolute?, :absolute?
709
+ alias_method :is_valid?, :valid?
710
+ alias_method :is_query?, :query?
711
+ alias_method :is_fragment?, :fragment?
712
+ alias_method :is_index?, :index?
713
+ alias_method :is_scheme_relative?, :scheme_relative?
714
+ alias_method :uri, :to_uri
715
+ alias_method :url, :to_url
716
+ alias_method :scheme, :to_scheme
717
+ alias_method :host, :to_host
718
+ alias_method :port, :to_port
719
+ alias_method :domain, :to_domain
720
+ alias_method :brand, :to_brand
721
+ alias_method :base, :to_base
722
+ alias_method :origin, :to_origin
723
+ alias_method :path, :to_path
724
+ alias_method :endpoint, :to_endpoint
725
+ alias_method :query, :to_query
726
+ alias_method :query_hash, :to_query_hash
727
+ alias_method :fragment, :to_fragment
728
+ alias_method :extension, :to_extension
729
+ alias_method :user, :to_user
730
+ alias_method :password, :to_password
731
+ alias_method :sub_domain, :to_sub_domain
665
732
  end
666
733
  end
data/lib/wgit/utils.rb CHANGED
@@ -23,7 +23,7 @@ module Wgit
23
23
  obj.instance_variables.each do |var|
24
24
  next if ignore.include?(var.to_s)
25
25
 
26
- key = var.to_s[1..-1] # Remove the @ prefix.
26
+ key = var.to_s[1..] # Remove the @ prefix.
27
27
  key = key.to_sym unless use_strings_as_keys
28
28
  hash[key] = obj.instance_variable_get(var)
29
29
  end
@@ -37,9 +37,9 @@ module Wgit
37
37
  # @yield [el] Gives each element (Object) of obj_or_objects if it's
38
38
  # Enumerable, otherwise obj_or_objs itself is given.
39
39
  # @return [Object] The obj_or_objs parameter is returned.
40
- def self.each(obj_or_objs)
40
+ def self.each(obj_or_objs, &block)
41
41
  if obj_or_objs.respond_to?(:each)
42
- obj_or_objs.each { |obj| yield(obj) }
42
+ obj_or_objs.each(&block)
43
43
  else
44
44
  yield(obj_or_objs)
45
45
  end
@@ -129,15 +129,13 @@ module Wgit
129
129
  # Prints out the search results in a search engine like format.
130
130
  # The format for each result looks like:
131
131
  #
132
+ # ```
132
133
  # Title
133
- #
134
134
  # Keywords (if there are some)
135
- #
136
135
  # Text Snippet (formatted to show the searched for query, if provided)
137
- #
138
136
  # URL
139
- #
140
137
  # <empty_line_seperator>
138
+ # ```
141
139
  #
142
140
  # @param results [Array<Wgit::Document>] Array of Wgit::Document's which
143
141
  # each have had #search!(query) called (to update its @text with the
@@ -147,7 +145,7 @@ module Wgit
147
145
  # @param stream [#puts] Any object that respond_to?(:puts). It is used
148
146
  # to output text somewhere e.g. a file or STDERR.
149
147
  # @return [Integer] The number of results.
150
- def self.printf_search_results(results, keyword_limit: 5, stream: STDOUT)
148
+ def self.pprint_search_results(results, keyword_limit: 5, stream: $stdout)
151
149
  raise 'stream must respond_to? :puts' unless stream.respond_to?(:puts)
152
150
 
153
151
  results.each do |doc|
@@ -167,56 +165,111 @@ module Wgit
167
165
  end
168
166
 
169
167
  # Sanitises the obj to make it uniform by calling the correct sanitize_*
170
- # method for its type e.g. if obj.is_a? String then sanitize(obj). Any type
171
- # not in the case statement will be ignored and returned as is.
168
+ # method for its type e.g. if obj.is_a? String then sanitize_str(obj) is called.
169
+ # Any type not in the case statement will be ignored and returned as is.
170
+ # Call this method if unsure what obj's type is.
172
171
  #
173
172
  # @param obj [Object] The object to be sanitized.
174
173
  # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
175
174
  # invalid characters.
176
- # @return [Object] The sanitized obj is both modified and then returned.
175
+ # @return [Object] The sanitized obj.
177
176
  def self.sanitize(obj, encode: true)
178
177
  case obj
178
+ when Wgit::Url
179
+ sanitize_url(obj, encode:)
179
180
  when String
180
- sanitize_str(obj, encode: encode)
181
+ sanitize_str(obj, encode:)
181
182
  when Array
182
- sanitize_arr(obj, encode: encode)
183
+ sanitize_arr(obj, encode:)
183
184
  else
184
185
  obj
185
186
  end
186
187
  end
187
188
 
189
+ # Sanitises a Wgit::Url to make it uniform. First sanitizes the Url as a
190
+ # String before replacing the Url value with the sanitized version. This
191
+ # method therefore modifies the given url param and also returns it.
192
+ #
193
+ # @param url [Wgit::Url] The Wgit::Url to sanitize. url is modified.
194
+ # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
195
+ # invalid characters.
196
+ # @return [Wgit::Url] The sanitized url, which is also modified.
197
+ def self.sanitize_url(url, encode: true)
198
+ str = sanitize_str(url.to_s, encode:)
199
+ url.replace(str)
200
+ end
201
+
188
202
  # Sanitises a String to make it uniform. Strips any leading/trailing white
189
203
  # space. Also applies UTF-8 encoding (replacing invalid characters) if
190
204
  # `encode: true`.
191
205
  #
192
- # @param str [String] The String to sanitize. str is modified.
206
+ # @param str [String] The String to sanitize. str is not modified.
193
207
  # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
194
208
  # invalid characters.
195
- # @return [String] The sanitized str is both modified and then returned.
209
+ # @return [String] The sanitized str.
196
210
  def self.sanitize_str(str, encode: true)
197
- if str.is_a?(String)
198
- str.encode!('UTF-8', undef: :replace, invalid: :replace) if encode
199
- str.strip!
200
- end
211
+ return str unless str.is_a?(String)
201
212
 
202
- str
213
+ str = str.encode('UTF-8', undef: :replace, invalid: :replace) if encode
214
+ str.strip
203
215
  end
204
216
 
205
217
  # Sanitises an Array to make it uniform. Removes empty Strings and nils,
206
218
  # processes non empty Strings using Wgit::Utils.sanitize and removes
207
219
  # duplicates.
208
220
  #
209
- # @param arr [Enumerable] The Array to sanitize. arr is modified.
210
- # @return [Enumerable] The sanitized arr is both modified and then returned.
221
+ # @param arr [Enumerable] The Array to sanitize. arr is not modified.
222
+ # @return [Enumerable] The sanitized arr.
211
223
  def self.sanitize_arr(arr, encode: true)
212
- if arr.is_a?(Array)
213
- arr.map! { |str| sanitize(str, encode: encode) }
214
- arr.reject! { |str| str.is_a?(String) ? str.empty? : false }
215
- arr.compact!
216
- arr.uniq!
217
- end
224
+ return arr unless arr.is_a?(Array)
218
225
 
219
226
  arr
227
+ .map { |str| sanitize(str, encode:) }
228
+ .reject { |str| str.is_a?(String) && str.empty? }
229
+ .compact
230
+ .uniq
231
+ end
232
+
233
+ # Pretty prints a log statement, used for debugging purposes.
234
+ #
235
+ # Use like:
236
+ #
237
+ # ```
238
+ # Wgit::Utils.pprint 1, include_html: include_html, ignore: ignore_vars
239
+ # ```
240
+ #
241
+ # Which produces a log like:
242
+ #
243
+ # ```
244
+ # DEBUG_1 - include_html: true | ignore: ['@html', '@parser']
245
+ # ```
246
+ #
247
+ # @param identifier [#to_s] A log identifier e.g. "START" or 1 etc.
248
+ # @param stream [#puts] Any object that respond_to? :puts and :print. It is
249
+ # used to output the log text somewhere e.g. a file or STDERR.
250
+ # @param prefix [String] The log prefix, useful for visibility/grepping.
251
+ # @param new_line [Boolean] Whether or not to use a new line (\n) as the
252
+ # separator.
253
+ # @param vars [Hash<#inspect, #inspect>] The vars to inspect in the log.
254
+ def self.pprint(identifier, stream: $stdout, prefix: 'DEBUG', new_line: false, **vars)
255
+ sep1 = new_line ? "\n" : ' - '
256
+ sep2 = new_line ? "\n" : ' | '
257
+
258
+ stream.print "\n#{prefix}_#{identifier}#{sep1}"
259
+
260
+ vars.each_with_index do |arr, i|
261
+ last_item = (i + 1) == vars.size
262
+ sep3 = sep2
263
+ sep3 = new_line ? "\n" : '' if last_item
264
+ k, v = arr
265
+
266
+ stream.print "#{k}: #{v}#{sep3}"
267
+ end
268
+
269
+ stream.puts "\n"
270
+ stream.puts "\n" unless new_line
271
+
272
+ nil
220
273
  end
221
274
  end
222
275
  end
data/lib/wgit/version.rb CHANGED
@@ -6,7 +6,7 @@
6
6
  # @author Michael Telford
7
7
  module Wgit
8
8
  # The current gem version of Wgit.
9
- VERSION = '0.10.7'
9
+ VERSION = '0.11.0'
10
10
 
11
11
  # Returns the current gem version of Wgit as a String.
12
12
  def self.version
data/lib/wgit.rb CHANGED
@@ -10,6 +10,7 @@ require_relative 'wgit/document_extractors'
10
10
  require_relative 'wgit/crawler'
11
11
  require_relative 'wgit/database/model'
12
12
  require_relative 'wgit/database/database'
13
+ require_relative 'wgit/robots_parser'
13
14
  require_relative 'wgit/indexer'
14
15
  require_relative 'wgit/dsl'
15
16
  require_relative 'wgit/base'