wgit 0.10.7 → 0.11.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/wgit/url.rb CHANGED
@@ -28,6 +28,9 @@ module Wgit
28
28
  # The duration of the crawl for this Url (in seconds).
29
29
  attr_accessor :crawl_duration
30
30
 
31
+ # Record the redirects from the initial Url to the final Url.
32
+ attr_reader :redirects
33
+
31
34
  # Initializes a new instance of Wgit::Url which models a web based
32
35
  # HTTP URL.
33
36
  #
@@ -57,12 +60,14 @@ module Wgit
57
60
  crawled = obj.fetch('crawled', false)
58
61
  date_crawled = obj.fetch('date_crawled', nil)
59
62
  crawl_duration = obj.fetch('crawl_duration', nil)
63
+ redirects = obj.fetch('redirects', {})
60
64
  end
61
65
 
62
66
  @uri = Addressable::URI.parse(url)
63
67
  @crawled = crawled
64
68
  @date_crawled = date_crawled
65
69
  @crawl_duration = crawl_duration
70
+ @redirects = redirects || {}
66
71
 
67
72
  super(url)
68
73
  end
@@ -107,6 +112,34 @@ Addressable::URI::InvalidURIError")
107
112
  nil
108
113
  end
109
114
 
115
+ # Overrides String#inspect to distinguish this Url from a String.
116
+ #
117
+ # @return [String] A short textual representation of this Url.
118
+ def inspect
119
+ "#<Wgit::Url url=\"#{self}\" crawled=#{@crawled}>"
120
+ end
121
+
122
+ # Overrides String#replace setting the new_url @uri and String value.
123
+ #
124
+ # @param new_url [Wgit::Url, String] The new URL value.
125
+ # @return [String] The new URL value once set.
126
+ def replace(new_url)
127
+ @uri = Addressable::URI.parse(new_url)
128
+
129
+ super(new_url)
130
+ end
131
+
132
+ # Overrides String#concat which oddly returns a Wgit::Url object, and
133
+ # instead returns a String. Therefore this method works the same as if
134
+ # you call String#concat, or its alias String#+, which is desired for
135
+ # this method. If you want to join two Urls, use Wgit::Url#join method.
136
+ #
137
+ # @param other [String] The String to concat onto this one.
138
+ # @return [String] The new concatted String, not a Wgit::Url.
139
+ def concat(other)
140
+ to_s.concat(other.to_s)
141
+ end
142
+
110
143
  # Sets the @crawled instance var, also setting @date_crawled for
111
144
  # convenience.
112
145
  #
@@ -117,14 +150,48 @@ Addressable::URI::InvalidURIError")
117
150
  @date_crawled = bool ? Wgit::Utils.time_stamp : nil
118
151
  end
119
152
 
120
- # Overrides String#replace setting the new_url @uri and String value.
153
+ # Sets the @redirects instance var, mapping any Strings into Wgit::Urls.
121
154
  #
122
- # @param new_url [Wgit::Url, String] The new URL value.
123
- # @return [String] The new URL value once set.
124
- def replace(new_url)
125
- @uri = Addressable::URI.parse(new_url)
155
+ # @param redirects [Hash] The redirects Hash to set for this Url.
156
+ def redirects=(redirects)
157
+ assert_type(redirects, Hash)
126
158
 
127
- super(new_url)
159
+ map_to_url = proc do |url|
160
+ Wgit::Url.new(url.to_s, crawled: @crawled, date_crawled: @date_crawled)
161
+ end
162
+
163
+ @redirects = redirects
164
+ .map { |from, to| [map_to_url.call(from), map_to_url.call(to)] }
165
+ .to_h
166
+ end
167
+
168
+ # Returns the Wgit::Url's starting with the originally requested Url to be
169
+ # crawled, followed by each redirected to Url, finishing with the final
170
+ # crawled Url e.g.
171
+ #
172
+ # Example Url redirects journey (dictated by the webserver):
173
+ #
174
+ # ```
175
+ # http://example.com => 301 to https://example.com
176
+ # https://example.com => 301 to https://example.com/
177
+ # https://example.com/ => 200 OK (no more redirects, crawl complete)
178
+ # ```
179
+ #
180
+ # Would return an Array of Wgit::Url's in the form of:
181
+ #
182
+ # ```
183
+ # %w(
184
+ # http://example.com
185
+ # https://example.com
186
+ # https://example.com/
187
+ # )
188
+ # ```
189
+ #
190
+ # @return [Array<Wgit::Url>] Each redirected to Url's finishing with the
191
+ # final (successfully) crawled Url. If no redirects took place, then just
192
+ # the originally requested Url is returned inside the Array.
193
+ def redirects_journey
194
+ [redirects.keys, self].flatten
128
195
  end
129
196
 
130
197
  # Returns true if self is a relative Url; false if absolute.
@@ -163,7 +230,7 @@ Addressable::URI::InvalidURIError")
163
230
  raise 'Url (self) cannot be empty' if empty?
164
231
 
165
232
  return false if scheme_relative?
166
- return true if @uri.relative?
233
+ return true if @uri.relative?
167
234
 
168
235
  # Self is absolute but may be relative to the opts param e.g. host.
169
236
  opts.select! { |_k, v| v }
@@ -219,22 +286,23 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
219
286
  !valid?
220
287
  end
221
288
 
222
- # Concats self and other together before returning a new Url. Self is not
223
- # modified.
289
+ # Joins self and other together before returning a new Url. Self is not
290
+ # modified. Some magic occurs depending on what is being joined, see
291
+ # the source code for more information.
224
292
  #
225
- # @param other [Wgit::Url, String] The other to concat to the end of self.
293
+ # @param other [Wgit::Url, String] The other (relative) Url to join to the
294
+ # end of self.
226
295
  # @return [Wgit::Url] self + separator + other, separator depends on other.
227
- def concat(other)
296
+ def join(other)
228
297
  other = Wgit::Url.new(other)
229
298
  raise 'other must be relative' unless other.relative?
230
299
 
231
300
  other = other.omit_leading_slash
232
301
  separator = %w[# ? .].include?(other[0]) ? '' : '/'
302
+ separator = '' if end_with?('/')
303
+ joined = self + separator + other
233
304
 
234
- # We use to_s below to call String#+, not Wgit::Url#+ (alias for concat).
235
- concatted = omit_trailing_slash.to_s + separator.to_s + other.to_s
236
-
237
- Wgit::Url.new(concatted)
305
+ Wgit::Url.new(joined)
238
306
  end
239
307
 
240
308
  # Normalizes/escapes self and returns a new Wgit::Url. Self isn't modified.
@@ -250,7 +318,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
250
318
  #
251
319
  # If self is absolute then it's returned as is, making this method
252
320
  # idempotent. The doc's `<base>` element is used if present, otherwise
253
- # `doc.url` is used as the base; which is concatted with self.
321
+ # `doc.url` is used as the base; which is joined with self.
254
322
  #
255
323
  # Typically used to build an absolute link obtained from a document.
256
324
  #
@@ -260,7 +328,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
260
328
  #
261
329
  # link.make_absolute(doc) # => "http://example.com/favicon.png"
262
330
  #
263
- # @param doc [Wgit::Document] The doc whose base Url is concatted with
331
+ # @param doc [Wgit::Document] The doc whose base Url is joined with
264
332
  # self.
265
333
  # @raise [StandardError] If doc isn't a Wgit::Document or if `doc.base_url`
266
334
  # raises an Exception.
@@ -272,7 +340,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
272
340
 
273
341
  return prefix_scheme(doc.url.to_scheme&.to_sym) if scheme_relative?
274
342
 
275
- absolute? ? self : doc.base_url(link: self).concat(self)
343
+ absolute? ? self : doc.base_url(link: self).join(self)
276
344
  end
277
345
 
278
346
  # Returns self having prefixed a scheme/protocol. Doesn't modify receiver.
@@ -420,7 +488,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
420
488
  return nil if path.nil? || path.empty?
421
489
  return Wgit::Url.new('/') if path == '/'
422
490
 
423
- Wgit::Url.new(path).omit_slashes
491
+ Wgit::Url.new(path).omit_leading_slash
424
492
  end
425
493
 
426
494
  # Returns the endpoint of this URL e.g. the bit after the host with any
@@ -432,7 +500,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
432
500
  # an endpoint, / is returned.
433
501
  def to_endpoint
434
502
  endpoint = @uri.path
435
- endpoint = '/' + endpoint unless endpoint.start_with?('/')
503
+ endpoint = "/#{endpoint}" unless endpoint.start_with?('/')
436
504
  Wgit::Url.new(endpoint)
437
505
  end
438
506
 
@@ -477,7 +545,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
477
545
  #
478
546
  # @return [Wgit::Url, nil] Containing just the extension string or nil.
479
547
  def to_extension
480
- path = to_path
548
+ path = to_path&.omit_trailing_slash
481
549
  return nil unless path
482
550
 
483
551
  segs = path.split('.')
@@ -523,7 +591,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
523
591
  #
524
592
  # @return [Wgit::Url] Self without a leading slash.
525
593
  def omit_leading_slash
526
- start_with?('/') ? Wgit::Url.new(self[1..-1]) : self
594
+ start_with?('/') ? Wgit::Url.new(self[1..]) : self
527
595
  end
528
596
 
529
597
  # Returns a new Wgit::Url containing self without a trailing slash. Is
@@ -557,7 +625,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
557
625
 
558
626
  return self if ['', '/'].include?(omit_base)
559
627
 
560
- Wgit::Url.new(omit_base).omit_slashes
628
+ Wgit::Url.new(omit_base).omit_leading_slash
561
629
  end
562
630
 
563
631
  # Returns a new Wgit::Url with the origin (base + port) removed e.g. Given
@@ -572,7 +640,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
572
640
 
573
641
  return self if ['', '/'].include?(omit_origin)
574
642
 
575
- Wgit::Url.new(omit_origin).omit_slashes
643
+ Wgit::Url.new(omit_origin).omit_leading_slash
576
644
  end
577
645
 
578
646
  # Returns a new Wgit::Url with the query string portion removed e.g. Given
@@ -635,32 +703,31 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
635
703
  start_with?('//')
636
704
  end
637
705
 
638
- alias + concat
639
- alias crawled? crawled
640
- alias is_relative? relative?
641
- alias is_absolute? absolute?
642
- alias is_valid? valid?
643
- alias is_query? query?
644
- alias is_fragment? fragment?
645
- alias is_index? index?
646
- alias is_scheme_relative? scheme_relative?
647
- alias uri to_uri
648
- alias url to_url
649
- alias scheme to_scheme
650
- alias host to_host
651
- alias port to_port
652
- alias domain to_domain
653
- alias brand to_brand
654
- alias base to_base
655
- alias origin to_origin
656
- alias path to_path
657
- alias endpoint to_endpoint
658
- alias query to_query
659
- alias query_hash to_query_hash
660
- alias fragment to_fragment
661
- alias extension to_extension
662
- alias user to_user
663
- alias password to_password
664
- alias sub_domain to_sub_domain
706
+ alias_method :crawled?, :crawled
707
+ alias_method :is_relative?, :relative?
708
+ alias_method :is_absolute?, :absolute?
709
+ alias_method :is_valid?, :valid?
710
+ alias_method :is_query?, :query?
711
+ alias_method :is_fragment?, :fragment?
712
+ alias_method :is_index?, :index?
713
+ alias_method :is_scheme_relative?, :scheme_relative?
714
+ alias_method :uri, :to_uri
715
+ alias_method :url, :to_url
716
+ alias_method :scheme, :to_scheme
717
+ alias_method :host, :to_host
718
+ alias_method :port, :to_port
719
+ alias_method :domain, :to_domain
720
+ alias_method :brand, :to_brand
721
+ alias_method :base, :to_base
722
+ alias_method :origin, :to_origin
723
+ alias_method :path, :to_path
724
+ alias_method :endpoint, :to_endpoint
725
+ alias_method :query, :to_query
726
+ alias_method :query_hash, :to_query_hash
727
+ alias_method :fragment, :to_fragment
728
+ alias_method :extension, :to_extension
729
+ alias_method :user, :to_user
730
+ alias_method :password, :to_password
731
+ alias_method :sub_domain, :to_sub_domain
665
732
  end
666
733
  end
data/lib/wgit/utils.rb CHANGED
@@ -23,7 +23,7 @@ module Wgit
23
23
  obj.instance_variables.each do |var|
24
24
  next if ignore.include?(var.to_s)
25
25
 
26
- key = var.to_s[1..-1] # Remove the @ prefix.
26
+ key = var.to_s[1..] # Remove the @ prefix.
27
27
  key = key.to_sym unless use_strings_as_keys
28
28
  hash[key] = obj.instance_variable_get(var)
29
29
  end
@@ -37,9 +37,9 @@ module Wgit
37
37
  # @yield [el] Gives each element (Object) of obj_or_objects if it's
38
38
  # Enumerable, otherwise obj_or_objs itself is given.
39
39
  # @return [Object] The obj_or_objs parameter is returned.
40
- def self.each(obj_or_objs)
40
+ def self.each(obj_or_objs, &block)
41
41
  if obj_or_objs.respond_to?(:each)
42
- obj_or_objs.each { |obj| yield(obj) }
42
+ obj_or_objs.each(&block)
43
43
  else
44
44
  yield(obj_or_objs)
45
45
  end
@@ -129,15 +129,13 @@ module Wgit
129
129
  # Prints out the search results in a search engine like format.
130
130
  # The format for each result looks like:
131
131
  #
132
+ # ```
132
133
  # Title
133
- #
134
134
  # Keywords (if there are some)
135
- #
136
135
  # Text Snippet (formatted to show the searched for query, if provided)
137
- #
138
136
  # URL
139
- #
140
137
  # <empty_line_separator>
138
+ # ```
141
139
  #
142
140
  # @param results [Array<Wgit::Document>] Array of Wgit::Document's which
143
141
  # each have had #search!(query) called (to update its @text with the
@@ -147,7 +145,7 @@ module Wgit
147
145
  # @param stream [#puts] Any object that respond_to?(:puts). It is used
148
146
  # to output text somewhere e.g. a file or STDERR.
149
147
  # @return [Integer] The number of results.
150
- def self.printf_search_results(results, keyword_limit: 5, stream: STDOUT)
148
+ def self.pprint_search_results(results, keyword_limit: 5, stream: $stdout)
151
149
  raise 'stream must respond_to? :puts' unless stream.respond_to?(:puts)
152
150
 
153
151
  results.each do |doc|
@@ -167,56 +165,111 @@ module Wgit
167
165
  end
168
166
 
169
167
  # Sanitises the obj to make it uniform by calling the correct sanitize_*
170
- # method for its type e.g. if obj.is_a? String then sanitize(obj). Any type
171
- # not in the case statement will be ignored and returned as is.
168
+ # method for its type e.g. if obj.is_a? String then sanitize_str(obj) is called.
169
+ # Any type not in the case statement will be ignored and returned as is.
170
+ # Call this method if unsure what obj's type is.
172
171
  #
173
172
  # @param obj [Object] The object to be sanitized.
174
173
  # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
175
174
  # invalid characters.
176
- # @return [Object] The sanitized obj is both modified and then returned.
175
+ # @return [Object] The sanitized obj.
177
176
  def self.sanitize(obj, encode: true)
178
177
  case obj
178
+ when Wgit::Url
179
+ sanitize_url(obj, encode:)
179
180
  when String
180
- sanitize_str(obj, encode: encode)
181
+ sanitize_str(obj, encode:)
181
182
  when Array
182
- sanitize_arr(obj, encode: encode)
183
+ sanitize_arr(obj, encode:)
183
184
  else
184
185
  obj
185
186
  end
186
187
  end
187
188
 
189
+ # Sanitises a Wgit::Url to make it uniform. First sanitizes the Url as a
190
+ # String before replacing the Url value with the sanitized version. This
191
+ # method therefore modifies the given url param and also returns it.
192
+ #
193
+ # @param url [Wgit::Url] The Wgit::Url to sanitize. url is modified.
194
+ # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
195
+ # invalid characters.
196
+ # @return [Wgit::Url] The sanitized url, which is also modified.
197
+ def self.sanitize_url(url, encode: true)
198
+ str = sanitize_str(url.to_s, encode:)
199
+ url.replace(str)
200
+ end
201
+
188
202
  # Sanitises a String to make it uniform. Strips any leading/trailing white
189
203
  # space. Also applies UTF-8 encoding (replacing invalid characters) if
190
204
  # `encode: true`.
191
205
  #
192
- # @param str [String] The String to sanitize. str is modified.
206
+ # @param str [String] The String to sanitize. str is not modified.
193
207
  # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
194
208
  # invalid characters.
195
- # @return [String] The sanitized str is both modified and then returned.
209
+ # @return [String] The sanitized str.
196
210
  def self.sanitize_str(str, encode: true)
197
- if str.is_a?(String)
198
- str.encode!('UTF-8', undef: :replace, invalid: :replace) if encode
199
- str.strip!
200
- end
211
+ return str unless str.is_a?(String)
201
212
 
202
- str
213
+ str = str.encode('UTF-8', undef: :replace, invalid: :replace) if encode
214
+ str.strip
203
215
  end
204
216
 
205
217
  # Sanitises an Array to make it uniform. Removes empty Strings and nils,
206
218
  # processes non empty Strings using Wgit::Utils.sanitize and removes
207
219
  # duplicates.
208
220
  #
209
- # @param arr [Enumerable] The Array to sanitize. arr is modified.
210
- # @return [Enumerable] The sanitized arr is both modified and then returned.
221
+ # @param arr [Enumerable] The Array to sanitize. arr is not modified.
222
+ # @return [Enumerable] The sanitized arr.
211
223
  def self.sanitize_arr(arr, encode: true)
212
- if arr.is_a?(Array)
213
- arr.map! { |str| sanitize(str, encode: encode) }
214
- arr.reject! { |str| str.is_a?(String) ? str.empty? : false }
215
- arr.compact!
216
- arr.uniq!
217
- end
224
+ return arr unless arr.is_a?(Array)
218
225
 
219
226
  arr
227
+ .map { |str| sanitize(str, encode:) }
228
+ .reject { |str| str.is_a?(String) && str.empty? }
229
+ .compact
230
+ .uniq
231
+ end
232
+
233
+ # Pretty prints a log statement, used for debugging purposes.
234
+ #
235
+ # Use like:
236
+ #
237
+ # ```
238
+ # Wgit::Utils.pprint 1, include_html: include_html, ignore: ignore_vars
239
+ # ```
240
+ #
241
+ # Which produces a log like:
242
+ #
243
+ # ```
244
+ # DEBUG_1 - include_html: true | ignore: ['@html', '@parser']
245
+ # ```
246
+ #
247
+ # @param identifier [#to_s] A log identifier e.g. "START" or 1 etc.
248
+ # @param stream [#puts] Any object that respond_to? :puts and :print. It is
249
+ # used to output the log text somewhere e.g. a file or STDERR.
250
+ # @param prefix [String] The log prefix, useful for visibility/grepping.
251
+ # @param new_line [Boolean] Whether or not to use a new line (\n) as the
252
+ # separator.
253
+ # @param vars [Hash<#inspect, #inspect>] The vars to inspect in the log.
254
+ def self.pprint(identifier, stream: $stdout, prefix: 'DEBUG', new_line: false, **vars)
255
+ sep1 = new_line ? "\n" : ' - '
256
+ sep2 = new_line ? "\n" : ' | '
257
+
258
+ stream.print "\n#{prefix}_#{identifier}#{sep1}"
259
+
260
+ vars.each_with_index do |arr, i|
261
+ last_item = (i + 1) == vars.size
262
+ sep3 = sep2
263
+ sep3 = new_line ? "\n" : '' if last_item
264
+ k, v = arr
265
+
266
+ stream.print "#{k}: #{v}#{sep3}"
267
+ end
268
+
269
+ stream.puts "\n"
270
+ stream.puts "\n" unless new_line
271
+
272
+ nil
220
273
  end
221
274
  end
222
275
  end
data/lib/wgit/version.rb CHANGED
@@ -6,7 +6,7 @@
6
6
  # @author Michael Telford
7
7
  module Wgit
8
8
  # The current gem version of Wgit.
9
- VERSION = '0.10.7'
9
+ VERSION = '0.11.0'
10
10
 
11
11
  # Returns the current gem version of Wgit as a String.
12
12
  def self.version
data/lib/wgit.rb CHANGED
@@ -10,6 +10,7 @@ require_relative 'wgit/document_extractors'
10
10
  require_relative 'wgit/crawler'
11
11
  require_relative 'wgit/database/model'
12
12
  require_relative 'wgit/database/database'
13
+ require_relative 'wgit/robots_parser'
13
14
  require_relative 'wgit/indexer'
14
15
  require_relative 'wgit/dsl'
15
16
  require_relative 'wgit/base'