wgit 0.11.0 → 0.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/wgit/model.rb ADDED
@@ -0,0 +1,164 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "./utils"
4
+
5
+ module Wgit
6
+ # Module used to build the Database collection objects, forming a data model.
7
+ # The models produced are Hash like and therefore DB agnostic. Each model
8
+ # will contain a unique field used for searching and avoiding duplicates,
9
+ # this is typically a `url` field. Also contained in the model are the
10
+ # search fields used in Database and Document #search calls.
11
+ module Model
12
+ # The default search fields used in Database and Document #search calls.
13
+ # The number of matches for each field is multiplied by the field weight,
14
+ # the total is the search score, used to sort the search results.
15
+ # Call `Wgit::Model.set_default_search_fields` to revert to default.
16
+ DEFAULT_SEARCH_FIELDS = {
17
+ title: 2,
18
+ description: 2,
19
+ keywords: 2,
20
+ text: 1
21
+ }.freeze
22
+
23
+ # The search fields used in Database and Document #search calls.
24
+ # The number of matches for each field is multiplied by the field weight,
25
+ # the total is the search score, used to sort the search results.
26
+ # Call `Wgit::Model.set_default_search_fields` to revert to default.
27
+ @search_fields = DEFAULT_SEARCH_FIELDS
28
+
29
+ # Whether or not to include the Document#html in the #document model.
30
+ @include_doc_html = false
31
+
32
+ # Whether or not to include the Document#score in the #document model.
33
+ @include_doc_score = false
34
+
35
+ class << self
36
+ # The search fields used in Database and Document #search calls.
37
+ # A custom setter method is also provided for changing these fields.
38
+ attr_reader :search_fields
39
+
40
+ # Whether or not to include the Document#html in the #document model.
41
+ attr_accessor :include_doc_html
42
+
43
+ # Whether or not to include the Document#score in the #document model.
44
+ attr_accessor :include_doc_score
45
+ end
46
+
47
+ # Sets the search fields used in Database and Document #search calls.
48
+ #
49
+ # You can pass the fields as an Array of Symbols which gives each field a
50
+ # weight of 1 meaning all fields are considered of equal value. Or you can
51
+ # pass a Hash of Symbol => Int and specify the weights yourself, allowing
52
+ # you to customise the search rankings.
53
+ #
54
+ # Use like:
55
+ # ```
56
+ # Wgit::Model.set_search_fields [:title, :text], db
57
+ # => { title: 1, text: 1 }
58
+ # Wgit::Model.set_search_fields({ title: 2, text: 1 }, db)
59
+ # => { title: 2, text: 1 }
60
+ # ```
61
+ #
62
+ # If the given db (database) param responds to #search_fields= then it will
63
+ # be called and given the fields to set. This should perform whatever the
64
+ # database adapter needs in order to search using the given fields e.g.
65
+ # creating a search index. Calling the DB enables the search_fields to be
66
+ # set globally within Wgit by one method call, this one.
67
+ #
68
+ # @param fields [Array<Symbol>, Hash<Symbol, Integer>] The field names or
69
+ # the field names with their corresponding search weights.
70
+ # @param db [Wgit::Database::DatabaseAdapter] A connected db instance. If
71
+ # db responds to #search_fields=, it will be called and given the fields.
72
+ # @raise [StandardError] If fields is of an incorrect type.
73
+ # @return [Hash<Symbol, Integer>] The fields and their weights.
74
+ def self.set_search_fields(fields, db = nil)
75
+ # We need a Hash of fields => weights (Symbols => Integers).
76
+ case fields
77
+ when Array # of Strings/Symbols.
78
+ fields = fields.map { |field| [field.to_sym, 1] }
79
+ when Hash # of Strings/Symbols and Integers.
80
+ fields = fields.map { |field, weight| [field.to_sym, weight.to_i] }
81
+ else
82
+ raise "fields must be an Array or Hash, not a #{fields.class}"
83
+ end
84
+
85
+ @search_fields = fields.to_h
86
+ db.search_fields = @search_fields if db.respond_to?(:search_fields=)
87
+
88
+ @search_fields
89
+ end
90
+
91
+ # Resets the search fields used in Database and Document #search calls to DEFAULT_SEARCH_FIELDS.
92
+ #
93
+ # If the given db (database) param responds to #search_fields= then it will
94
+ # be called and given the fields to set. This should perform whatever the
95
+ # database adapter needs in order to search using the given fields e.g.
96
+ # creating a search index. Calling the DB enables the search_fields to be
97
+ # set globally within Wgit by one method call, this one.
98
+ #
99
+ # @param db [Wgit::Database::DatabaseAdapter] A connected db instance. If
100
+ # db responds to #search_fields=, it will be called and given the fields.
101
+ # @return [Hash<Symbol, Integer>] The fields and their weights.
102
+ def self.set_default_search_fields(db = nil)
103
+ set_search_fields(DEFAULT_SEARCH_FIELDS, db)
104
+ end
105
+
106
+ # The data model for a Wgit::Url collection object and for an embedded
107
+ # 'url' inside a Wgit::Document collection object.
108
+ #
109
+ # The unique field for this model is `model['url']`.
110
+ #
111
+ # @param url [Wgit::Url] The Url data object.
112
+ # @return [Hash] The URL model ready for DB insertion.
113
+ def self.url(url)
114
+ raise "url must respond_to? :to_h" unless url.respond_to?(:to_h)
115
+
116
+ model = url.to_h
117
+ select_bson_types(model)
118
+ end
119
+
120
+ # The data model for a Wgit::Document collection object.
121
+ #
122
+ # The unique field for this model is `model['url']['url']`.
123
+ #
124
+ # @param doc [Wgit::Document] The Document data object.
125
+ # @return [Hash] The Document model ready for DB insertion.
126
+ def self.document(doc)
127
+ raise "doc must respond_to? :to_h" unless doc.respond_to?(:to_h)
128
+
129
+ model = doc.to_h(
130
+ include_html: @include_doc_html, include_score: @include_doc_score
131
+ )
132
+ model["url"] = url(doc.url) # Expand Url String into full object.
133
+
134
+ select_bson_types(model)
135
+ end
136
+
137
+ # Common fields when inserting a record into the DB.
138
+ #
139
+ # @return [Hash] Insertion fields common to all models.
140
+ def self.common_insert_data
141
+ {
142
+ date_added: Wgit::Utils.time_stamp,
143
+ date_modified: Wgit::Utils.time_stamp
144
+ }
145
+ end
146
+
147
+ # Common fields when updating a record in the DB.
148
+ #
149
+ # @return [Hash] Update fields common to all models.
150
+ def self.common_update_data
151
+ {
152
+ date_modified: Wgit::Utils.time_stamp
153
+ }
154
+ end
155
+
156
+ # Returns the model having removed non bson types (for use with MongoDB).
157
+ #
158
+ # @param model_hash [Hash] The model Hash to sanitize.
159
+ # @return [Hash] The model Hash with non bson types removed.
160
+ def self.select_bson_types(model_hash)
161
+ model_hash.select { |_k, v| v.respond_to?(:bson_type) }
162
+ end
163
+ end
164
+ end
data/lib/wgit/response.rb CHANGED
@@ -27,7 +27,7 @@ module Wgit
27
27
 
28
28
  # Defaults some values and returns a "blank" Wgit::Response object.
29
29
  def initialize
30
- @body = ''
30
+ @body = ""
31
31
  @headers = {}
32
32
  @redirections = {}
33
33
  @total_time = 0.0
@@ -45,7 +45,7 @@ module Wgit
45
45
  # @param time [Float] The time to add to @total_time.
46
46
  # @return [Float] @total_time's new value.
47
47
  def add_total_time(time)
48
- @total_time += (time || 0.0)
48
+ @total_time += time || 0.0
49
49
  end
50
50
 
51
51
  # Sets the HTML response body.
@@ -53,7 +53,7 @@ module Wgit
53
53
  # @param str [String] The new HTML body.
54
54
  # @return [String] @body's new value.
55
55
  def body=(str)
56
- @body = (str || '')
56
+ @body = str || ""
57
57
  end
58
58
 
59
59
  # Returns the HTML response body or nil (if it's empty).
@@ -81,10 +81,7 @@ module Wgit
81
81
  return
82
82
  end
83
83
 
84
- @headers = headers.map do |k, v|
85
- k = k.downcase.gsub('-', '_').to_sym
86
- [k, v]
87
- end.to_h
84
+ @headers = headers.transform_keys { |k| k.downcase.gsub("-", "_").to_sym }
88
85
  end
89
86
 
90
87
  # Returns whether or not the response is 404 Not Found.
@@ -146,7 +143,7 @@ module Wgit
146
143
  # @return [Boolean] True if Wgit should not index this site, false
147
144
  # otherwise.
148
145
  def no_index?
149
- headers.fetch(:x_robots_tag, '').downcase.strip == 'noindex'
146
+ headers.fetch(:x_robots_tag, "").downcase.strip == "noindex"
150
147
  end
151
148
 
152
149
  alias_method :code, :status
@@ -7,15 +7,15 @@ module Wgit
7
7
  include Wgit::Assertable
8
8
 
9
9
  # Key representing the start of a comment.
10
- KEY_COMMENT = '#'
10
+ KEY_COMMENT = "#"
11
11
  # Key value separator used in robots.txt files.
12
- KEY_SEPARATOR = ':'
12
+ KEY_SEPARATOR = ":"
13
13
  # Key representing a user agent.
14
- KEY_USER_AGENT = 'User-agent'
14
+ KEY_USER_AGENT = "User-agent"
15
15
  # Key representing an allow URL rule.
16
- KEY_ALLOW = 'Allow'
16
+ KEY_ALLOW = "Allow"
17
17
  # Key representing a disallow URL rule.
18
- KEY_DISALLOW = 'Disallow'
18
+ KEY_DISALLOW = "Disallow"
19
19
 
20
20
  # Value representing the Wgit user agent.
21
21
  USER_AGENT_WGIT = :wgit
@@ -143,7 +143,7 @@ module Wgit
143
143
  return line unless line.count(KEY_SEPARATOR) == 1
144
144
 
145
145
  segs = line.split(KEY_SEPARATOR)
146
- return '' if segs.size == 1
146
+ return "" if segs.size == 1
147
147
 
148
148
  segs.last.strip
149
149
  end
@@ -176,13 +176,13 @@ module Wgit
176
176
 
177
177
  def parse_special_syntax(path)
178
178
  # Remove $ e.g. "/blah$" becomes "/blah"
179
- path = path.gsub('$', '')
179
+ path = path.gsub("$", "")
180
180
 
181
181
  # Remove any inline comments e.g. "/blah # comment" becomes "/blah"
182
182
  path = path.split(" #{KEY_COMMENT}").first if path.include?(" #{KEY_COMMENT}")
183
183
 
184
184
  # Replace an empty path with * e.g. "Allow: " becomes "Allow: *"
185
- path = '*' if path.empty?
185
+ path = "*" if path.empty?
186
186
 
187
187
  path
188
188
  end
data/lib/wgit/url.rb CHANGED
@@ -1,9 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative 'utils'
4
- require_relative 'assertable'
5
- require 'uri'
6
- require 'addressable/uri'
3
+ require_relative "utils"
4
+ require_relative "assertable"
5
+ require "uri"
6
+ require "addressable/uri"
7
7
 
8
8
  module Wgit
9
9
  # Class modeling/serialising a web based HTTP URL.
@@ -56,11 +56,11 @@ module Wgit
56
56
  obj = url_or_obj
57
57
  assert_respond_to(obj, :fetch)
58
58
 
59
- url = obj.fetch('url') # Should always be present.
60
- crawled = obj.fetch('crawled', false)
61
- date_crawled = obj.fetch('date_crawled', nil)
62
- crawl_duration = obj.fetch('crawl_duration', nil)
63
- redirects = obj.fetch('redirects', {})
59
+ url = obj.fetch("url") # Should always be present.
60
+ crawled = obj.fetch("crawled", false)
61
+ date_crawled = obj.fetch("date_crawled", nil)
62
+ crawl_duration = obj.fetch("crawl_duration", nil)
63
+ redirects = obj.fetch("redirects", {})
64
64
  end
65
65
 
66
66
  @uri = Addressable::URI.parse(url)
@@ -89,7 +89,7 @@ module Wgit
89
89
  # @raise [StandardError] If obj.is_a?(String) is false.
90
90
  # @return [Wgit::Url] A Wgit::Url instance.
91
91
  def self.parse(obj)
92
- raise 'Can only parse if obj#is_a?(String)' unless obj.is_a?(String)
92
+ raise "Can only parse if obj#is_a?(String)" unless obj.is_a?(String)
93
93
 
94
94
  # Return a Wgit::Url as is to avoid losing state e.g. date_crawled etc.
95
95
  obj.is_a?(Wgit::Url) ? obj : new(obj)
@@ -98,7 +98,7 @@ module Wgit
98
98
  # Returns a Wgit::Url instance from Wgit::Url.parse, or nil if obj cannot
99
99
  # be parsed successfully e.g. the String is invalid.
100
100
  #
101
- # Use this method when you can't gaurentee that obj is parsable as a URL.
101
+ # Use this method when you can't guarantee that obj is parsable as a URL.
102
102
  # See Wgit::Url.parse for more information.
103
103
  #
104
104
  # @param obj [Object] The object to parse, which #is_a?(String).
@@ -227,7 +227,7 @@ Addressable::URI::InvalidURIError")
227
227
  def relative?(opts = {})
228
228
  defaults = { origin: nil, host: nil, domain: nil, brand: nil }
229
229
  opts = defaults.merge(opts)
230
- raise 'Url (self) cannot be empty' if empty?
230
+ raise "Url (self) cannot be empty" if empty?
231
231
 
232
232
  return false if scheme_relative?
233
233
  return true if @uri.relative?
@@ -295,11 +295,11 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
295
295
  # @return [Wgit::Url] self + separator + other, separator depends on other.
296
296
  def join(other)
297
297
  other = Wgit::Url.new(other)
298
- raise 'other must be relative' unless other.relative?
298
+ raise "other must be relative" unless other.relative?
299
299
 
300
300
  other = other.omit_leading_slash
301
- separator = %w[# ? .].include?(other[0]) ? '' : '/'
302
- separator = '' if end_with?('/')
301
+ separator = %w[# ? .].include?(other[0]) ? "" : "/"
302
+ separator = "" if end_with?("/")
303
303
  joined = self + separator + other
304
304
 
305
305
  Wgit::Url.new(joined)
@@ -335,7 +335,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
335
335
  # @return [Wgit::Url] Self in absolute form.
336
336
  def make_absolute(doc)
337
337
  assert_type(doc, Wgit::Document)
338
- raise 'Cannot make absolute when Document @url is not valid' \
338
+ raise "Cannot make absolute when Document @url is not valid" \
339
339
  unless doc.url.valid?
340
340
 
341
341
  return prefix_scheme(doc.url.to_scheme&.to_sym) if scheme_relative?
@@ -355,7 +355,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
355
355
 
356
356
  return self if absolute? && !scheme_relative?
357
357
 
358
- separator = scheme_relative? ? '' : '//'
358
+ separator = scheme_relative? ? "" : "//"
359
359
  Wgit::Url.new("#{scheme}:#{separator}#{self}")
360
360
  end
361
361
 
@@ -364,8 +364,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
364
364
  #
365
365
  # @return [Hash] self's instance vars as a Hash.
366
366
  def to_h
367
- h = Wgit::Utils.to_h(self, ignore: ['@uri'])
368
- Hash[h.to_a.insert(0, ['url', self])] # Insert url at position 0.
367
+ h = Wgit::Utils.to_h(self, ignore: ["@uri"])
368
+ Hash[h.to_a.insert(0, ["url", to_s])] # Insert url at position 0.
369
369
  end
370
370
 
371
371
  # Returns a normalised URI object for this URL.
@@ -440,7 +440,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
440
440
  dot_domain = ".#{to_domain}"
441
441
  return nil unless include?(dot_domain)
442
442
 
443
- sub_domain = to_host.sub(dot_domain, '')
443
+ sub_domain = to_host.sub(dot_domain, "")
444
444
  Wgit::Url.new(sub_domain)
445
445
  end
446
446
 
@@ -450,7 +450,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
450
450
  # @return [Wgit::Url, nil] Containing just the brand or nil.
451
451
  def to_brand
452
452
  domain = to_domain
453
- domain ? Wgit::Url.new(domain.split('.').first) : nil
453
+ domain ? Wgit::Url.new(domain.split(".").first) : nil
454
454
  end
455
455
 
456
456
  # Returns only the base of this URL e.g. the protocol scheme and host
@@ -486,7 +486,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
486
486
  def to_path
487
487
  path = @uri.path
488
488
  return nil if path.nil? || path.empty?
489
- return Wgit::Url.new('/') if path == '/'
489
+ return Wgit::Url.new("/") if path == "/"
490
490
 
491
491
  Wgit::Url.new(path).omit_leading_slash
492
492
  end
@@ -500,7 +500,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
500
500
  # an endpoint, / is returned.
501
501
  def to_endpoint
502
502
  endpoint = @uri.path
503
- endpoint = "/#{endpoint}" unless endpoint.start_with?('/')
503
+ endpoint = "/#{endpoint}" unless endpoint.start_with?("/")
504
504
  Wgit::Url.new(endpoint)
505
505
  end
506
506
 
@@ -524,8 +524,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
524
524
  query_str = to_query
525
525
  return {} unless query_str
526
526
 
527
- query_str.split('&').each_with_object({}) do |param, hash|
528
- k, v = param.split('=')
527
+ query_str.split("&").each_with_object({}) do |param, hash|
528
+ k, v = param.split("=")
529
529
  k = k.to_sym if symbolize_keys
530
530
  hash[k] = v
531
531
  end
@@ -548,7 +548,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
548
548
  path = to_path&.omit_trailing_slash
549
549
  return nil unless path
550
550
 
551
- segs = path.split('.')
551
+ segs = path.split(".")
552
552
  segs.length > 1 ? Wgit::Url.new(segs.last) : nil
553
553
  end
554
554
 
@@ -591,7 +591,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
591
591
  #
592
592
  # @return [Wgit::Url] Self without a leading slash.
593
593
  def omit_leading_slash
594
- start_with?('/') ? Wgit::Url.new(self[1..]) : self
594
+ start_with?("/") ? Wgit::Url.new(self[1..]) : self
595
595
  end
596
596
 
597
597
  # Returns a new Wgit::Url containing self without a trailing slash. Is
@@ -600,7 +600,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
600
600
  #
601
601
  # @return [Wgit::Url] Self without a trailing slash.
602
602
  def omit_trailing_slash
603
- end_with?('/') ? Wgit::Url.new(chop) : self
603
+ end_with?("/") ? Wgit::Url.new(chop) : self
604
604
  end
605
605
 
606
606
  # Returns a new Wgit::Url containing self without a leading or trailing
@@ -621,9 +621,9 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
621
621
  # @return [Wgit::Url] Self containing everything after the base.
622
622
  def omit_base
623
623
  base_url = to_base
624
- omit_base = base_url ? gsub(base_url, '') : self
624
+ omit_base = base_url ? gsub(base_url, "") : self
625
625
 
626
- return self if ['', '/'].include?(omit_base)
626
+ return self if ["", "/"].include?(omit_base)
627
627
 
628
628
  Wgit::Url.new(omit_base).omit_leading_slash
629
629
  end
@@ -636,9 +636,9 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
636
636
  # @return [Wgit::Url] Self containing everything after the origin.
637
637
  def omit_origin
638
638
  origin = to_origin
639
- omit_origin = origin ? gsub(origin, '') : self
639
+ omit_origin = origin ? gsub(origin, "") : self
640
640
 
641
- return self if ['', '/'].include?(omit_origin)
641
+ return self if ["", "/"].include?(omit_origin)
642
642
 
643
643
  Wgit::Url.new(omit_origin).omit_leading_slash
644
644
  end
@@ -652,7 +652,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
652
652
  # @return [Wgit::Url] Self with the query string portion removed.
653
653
  def omit_query
654
654
  query = to_query
655
- omit_query_string = query ? gsub("?#{query}", '') : self
655
+ omit_query_string = query ? gsub("?#{query}", "") : self
656
656
 
657
657
  Wgit::Url.new(omit_query_string)
658
658
  end
@@ -667,7 +667,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
667
667
  # @return [Wgit::Url] Self with the fragment portion removed.
668
668
  def omit_fragment
669
669
  fragment = to_fragment
670
- omit_fragment = fragment ? gsub("##{fragment}", '') : self
670
+ omit_fragment = fragment ? gsub("##{fragment}", "") : self
671
671
 
672
672
  Wgit::Url.new(omit_fragment)
673
673
  end
@@ -677,7 +677,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
677
677
  #
678
678
  # @return [Boolean] True if self is a query string, false otherwise.
679
679
  def query?
680
- start_with?('?')
680
+ start_with?("?")
681
681
  end
682
682
 
683
683
  # Returns true if self is a URL fragment e.g. #top etc. Note this
@@ -685,14 +685,14 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
685
685
  #
686
686
  # @return [Boolean] True if self is a fragment, false otherwise.
687
687
  def fragment?
688
- start_with?('#')
688
+ start_with?("#")
689
689
  end
690
690
 
691
691
  # Returns true if self equals '/' a.k.a. index.
692
692
  #
693
693
  # @return [Boolean] True if self equals '/', false otherwise.
694
694
  def index?
695
- self == '/'
695
+ self == "/"
696
696
  end
697
697
 
698
698
  # Returns true if self starts with '//' a.k.a a scheme/protocol relative
@@ -700,7 +700,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
700
700
  #
701
701
  # @return [Boolean] True if self starts with '//', false otherwise.
702
702
  def scheme_relative?
703
- start_with?('//')
703
+ start_with?("//")
704
704
  end
705
705
 
706
706
  alias_method :crawled?, :crawled