wgit 0.0.17 → 0.0.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  ### Default Document Extensions ###
2
4
 
3
5
  # Base.
@@ -5,9 +7,9 @@ Wgit::Document.define_extension(
5
7
  :base,
6
8
  '//base/@href',
7
9
  singleton: true,
8
- text_content_only: true,
10
+ text_content_only: true
9
11
  ) do |base|
10
- base = Wgit::Url.new(base) if base
12
+ Wgit::Url.new(base) if base
11
13
  end
12
14
 
13
15
  # Title.
@@ -15,7 +17,7 @@ Wgit::Document.define_extension(
15
17
  :title,
16
18
  '//title',
17
19
  singleton: true,
18
- text_content_only: true,
20
+ text_content_only: true
19
21
  )
20
22
 
21
23
  # Author.
@@ -23,7 +25,7 @@ Wgit::Document.define_extension(
23
25
  :author,
24
26
  '//meta[@name="author"]/@content',
25
27
  singleton: true,
26
- text_content_only: true,
28
+ text_content_only: true
27
29
  )
28
30
 
29
31
  # Keywords.
@@ -31,9 +33,9 @@ Wgit::Document.define_extension(
31
33
  :keywords,
32
34
  '//meta[@name="keywords"]/@content',
33
35
  singleton: true,
34
- text_content_only: true,
36
+ text_content_only: true
35
37
  ) do |keywords, source|
36
- if keywords and source == :html
38
+ if keywords && (source == :html)
37
39
  keywords = keywords.split(',')
38
40
  Wgit::Utils.process_arr(keywords)
39
41
  end
@@ -45,9 +47,9 @@ Wgit::Document.define_extension(
45
47
  :links,
46
48
  '//a/@href',
47
49
  singleton: false,
48
- text_content_only: true,
50
+ text_content_only: true
49
51
  ) do |links|
50
- links.map! { |link| Wgit::Url.new(link) } if links
52
+ links&.map! { |link| Wgit::Url.new(link) }
51
53
  end
52
54
 
53
55
  # Text.
@@ -55,5 +57,5 @@ Wgit::Document.define_extension(
55
57
  :text,
56
58
  proc { Wgit::Document.text_elements_xpath },
57
59
  singleton: false,
58
- text_content_only: true,
60
+ text_content_only: true
59
61
  )
data/lib/wgit/indexer.rb CHANGED
@@ -1,8 +1,9 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require_relative 'crawler'
2
4
  require_relative 'database/database'
3
5
 
4
6
  module Wgit
5
-
6
7
  # Convenience method to index the World Wide Web using
7
8
  # Wgit::Indexer#index_the_web.
8
9
  #
@@ -18,7 +19,7 @@ module Wgit
18
19
  # scraped from the web (default is 1GB). Note, that this value is used to
19
20
  # determine when to stop crawling; it's not a guarantee of the max data
20
21
  # that will be obtained.
21
- def self.index_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
22
+ def self.index_the_web(max_sites_to_crawl = -1, max_data_size = 1_048_576_000)
22
23
  db = Wgit::Database.new
23
24
  indexer = Wgit::Indexer.new(db)
24
25
  indexer.index_the_web(max_sites_to_crawl, max_data_size)
@@ -81,7 +82,6 @@ module Wgit
81
82
 
82
83
  # Class which sets up a crawler and saves the indexed docs to a database.
83
84
  class Indexer
84
-
85
85
  # The crawler used to scrape the WWW.
86
86
  attr_reader :crawler
87
87
 
@@ -109,19 +109,19 @@ module Wgit
109
109
  # scraped from the web (default is 1GB). Note, that this value is used to
110
110
  # determine when to stop crawling; it's not a guarantee of the max data
111
111
  # that will be obtained.
112
- def index_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
112
+ def index_the_web(max_sites_to_crawl = -1, max_data_size = 1_048_576_000)
113
113
  if max_sites_to_crawl < 0
114
114
  Wgit.logger.info("Indexing until the database has been filled or it runs out of \
115
115
  urls to crawl (which might be never).")
116
116
  end
117
117
  site_count = 0
118
118
 
119
- while keep_crawling?(site_count, max_sites_to_crawl, max_data_size) do
119
+ while keep_crawling?(site_count, max_sites_to_crawl, max_data_size)
120
120
  Wgit.logger.info("Current database size: #{@db.size}")
121
121
  @crawler.urls = @db.uncrawled_urls
122
122
 
123
123
  if @crawler.urls.empty?
124
- Wgit.logger.info("No urls to crawl, exiting.")
124
+ Wgit.logger.info('No urls to crawl, exiting.')
125
125
  return
126
126
  end
127
127
  Wgit.logger.info("Starting crawl loop for: #{@crawler.urls}")
@@ -181,9 +181,7 @@ iteration.")
181
181
 
182
182
  ext_urls = @crawler.crawl_site(url) do |doc|
183
183
  result = true
184
- if block_given?
185
- result = yield(doc)
186
- end
184
+ result = yield(doc) if block_given?
187
185
 
188
186
  if result
189
187
  if write_doc_to_db(doc)
@@ -221,9 +219,7 @@ site: #{url}")
221
219
  def index_this_page(url, insert_externals = true)
222
220
  document = @crawler.crawl_page(url) do |doc|
223
221
  result = true
224
- if block_given?
225
- result = yield(doc)
226
- end
222
+ result = yield(doc) if block_given?
227
223
 
228
224
  if result
229
225
  if write_doc_to_db(doc)
@@ -244,11 +240,20 @@ site: #{url}")
244
240
  nil
245
241
  end
246
242
 
247
- private
243
+ protected
248
244
 
249
- # Keep crawling or not based on DB size and current loop iteration.
245
+ # Returns whether or not to keep crawling based on the DB size and current
246
+ # loop iteration.
247
+ #
248
+ # @param site_count [Integer] The current number of crawled sites.
249
+ # @param max_sites_to_crawl [Integer] The maximum number of sites to crawl
250
+ # before stopping.
251
+ # @param max_data_size [Integer] The maximum amount of data to crawl before
252
+ # stopping.
253
+ # @return [Boolean] True if the crawl should continue, false otherwise.
250
254
  def keep_crawling?(site_count, max_sites_to_crawl, max_data_size)
251
255
  return false if @db.size >= max_data_size
256
+
252
257
  # If max_sites_to_crawl is -1 for example then crawl away.
253
258
  if max_sites_to_crawl < 0
254
259
  true
@@ -257,8 +262,11 @@ site: #{url}")
257
262
  end
258
263
  end
259
264
 
260
- # The unique url index on the documents collection prevents duplicate
261
- # inserts.
265
+ # Write the doc to the DB. Note that the unique url index on the documents
266
+ # collection deliberately prevents duplicate inserts.
267
+ #
268
+ # @param doc [Wgit::Document] The document to write to the DB.
269
+ # @return [Boolean] True if the write was successful, false otherwise.
262
270
  def write_doc_to_db(doc)
263
271
  @db.insert(doc)
264
272
  Wgit.logger.info("Saved document for url: #{doc.url}")
@@ -268,18 +276,20 @@ site: #{url}")
268
276
  false
269
277
  end
270
278
 
271
- # The unique url index on the urls collection prevents duplicate inserts.
279
+ # Write the urls to the DB. Note that the unique url index on the urls
280
+ # collection deliberately prevents duplicate inserts.
281
+ #
282
+ # @param urls [Array<Wgit::Url>] The urls to write to the DB.
283
+ # @return [Boolean] True if the write was successful, false otherwise.
272
284
  def write_urls_to_db(urls)
273
285
  count = 0
274
286
  if urls.respond_to?(:each)
275
287
  urls.each do |url|
276
- begin
277
- @db.insert(url)
278
- count += 1
279
- Wgit.logger.info("Inserted url: #{url}")
280
- rescue Mongo::Error::OperationFailure
281
- Wgit.logger.info("Url already exists: #{url}")
282
- end
288
+ @db.insert(url)
289
+ count += 1
290
+ Wgit.logger.info("Inserted url: #{url}")
291
+ rescue Mongo::Error::OperationFailure
292
+ Wgit.logger.info("Url already exists: #{url}")
283
293
  end
284
294
  end
285
295
  count
data/lib/wgit/logger.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # FYI: The default logger is set at the bottom of this file.
2
4
 
3
5
  require 'logger'
@@ -24,7 +26,7 @@ module Wgit
24
26
  # @return [Logger] The default Logger instance.
25
27
  def self.default_logger
26
28
  logger = Logger.new(STDOUT, progname: 'wgit', level: :info)
27
- logger.formatter = proc do |severity, datetime, progname, msg|
29
+ logger.formatter = proc do |_severity, _datetime, progname, msg|
28
30
  "[#{progname}] #{msg}\n"
29
31
  end
30
32
  logger
@@ -33,7 +35,7 @@ module Wgit
33
35
  # Sets the default Logger instance to be used by Wgit.
34
36
  # @return [Logger] The default Logger instance.
35
37
  def self.use_default_logger
36
- @logger = self.default_logger
38
+ @logger = default_logger
37
39
  end
38
40
  end
39
41
 
data/lib/wgit/url.rb CHANGED
@@ -1,10 +1,11 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require_relative 'utils'
2
4
  require_relative 'assertable'
3
5
  require 'uri'
4
6
  require 'addressable/uri'
5
7
 
6
8
  module Wgit
7
-
8
9
  # Class modeling a web based URL.
9
10
  # Can be an internal/relative link e.g. "about.html" or a full URL
10
11
  # e.g. "http://www.google.co.uk". Is a subclass of String and uses
@@ -12,8 +13,9 @@ module Wgit
12
13
  class Url < String
13
14
  include Assertable
14
15
 
15
- # Whether or not the Url has been crawled or not.
16
- attr_accessor :crawled
16
+ # Whether or not the Url has been crawled. A custom crawled= method
17
+ # is also provided by this class.
18
+ attr_reader :crawled
17
19
 
18
20
  # The date which the Url was crawled.
19
21
  attr_accessor :date_crawled
@@ -39,9 +41,9 @@ module Wgit
39
41
  obj = url_or_obj
40
42
  assert_respond_to(obj, [:fetch, :[]])
41
43
 
42
- url = obj.fetch("url") # Should always be present.
43
- crawled = obj.fetch("crawled", false)
44
- date_crawled = obj["date_crawled"]
44
+ url = obj.fetch('url') # Should always be present.
45
+ crawled = obj.fetch('crawled', false)
46
+ date_crawled = obj['date_crawled']
45
47
  end
46
48
 
47
49
  @uri = Addressable::URI.parse(url)
@@ -56,7 +58,7 @@ module Wgit
56
58
  # @param str [String] The URL string to parse.
57
59
  # @return [Wgit::Url] The parsed Url object.
58
60
  def self.parse(str)
59
- self.new(str)
61
+ new(str)
60
62
  end
61
63
 
62
64
  # Raises an exception if url is not a valid HTTP URL.
@@ -65,13 +67,11 @@ module Wgit
65
67
  # @raise [RuntimeError] If url is invalid.
66
68
  def self.validate(url)
67
69
  url = Wgit::Url.new(url)
68
- if url.relative_link?
69
- raise "Invalid url (or a relative link): #{url}"
70
- end
71
- unless url.start_with?("http://") or url.start_with?("https://")
70
+ raise "Invalid url (or a relative link): #{url}" if url.relative_link?
71
+ unless url.start_with?('http://') || url.start_with?('https://')
72
72
  raise "Invalid url (missing protocol prefix): #{url}"
73
73
  end
74
- if URI.regexp.match(url.normalise).nil?
74
+ if URI::DEFAULT_PARSER.make_regexp.match(url.normalise).nil?
75
75
  raise "Invalid url: #{url}"
76
76
  end
77
77
  end
@@ -83,7 +83,7 @@ module Wgit
83
83
  def self.valid?(url)
84
84
  Wgit::Url.validate(url)
85
85
  true
86
- rescue
86
+ rescue StandardError
87
87
  false
88
88
  end
89
89
 
@@ -95,7 +95,7 @@ module Wgit
95
95
  # @param https [Boolean] Whether the protocol prefix is https or http.
96
96
  # @return [Wgit::Url] The url with a protocol prefix.
97
97
  def self.prefix_protocol(url, https = false)
98
- unless url.start_with?("http://") or url.start_with?("https://")
98
+ unless url.start_with?('http://') || url.start_with?('https://')
99
99
  if https
100
100
  url.replace("https://#{url}")
101
101
  else
@@ -113,7 +113,7 @@ module Wgit
113
113
  def self.concat(host, link)
114
114
  host = Wgit::Url.new(host).without_trailing_slash
115
115
  link = Wgit::Url.new(link).without_leading_slash
116
- separator = (link.start_with?('#') or link.start_with?('?')) ? '' : '/'
116
+ separator = (link.start_with?('#') || link.start_with?('?')) ? '' : '/'
117
117
  Wgit::Url.new(host + separator + link)
118
118
  end
119
119
 
@@ -126,26 +126,35 @@ module Wgit
126
126
  super(new_url)
127
127
  end
128
128
 
129
- # Returns true if self is a relative Url.
129
+ # Returns true if self is a relative Url; false if absolute.
130
130
  #
131
131
  # All external links in a page are expected to have a protocol prefix e.g.
132
132
  # "http://", otherwise the link is treated as an internal link (regardless
133
- # of whether it's valid or not). The only exception is if host or domain is
134
- # provided and self is a page belonging to that host/domain; then the link
135
- # is relative.
133
+ # of whether it's valid or not). The only exception is if an opts arg is
134
+ # provided and self is a page belonging to that arg type e.g. domain; then
135
+ # the link is relative.
136
136
  #
137
- # @param host [Wgit::Url, String] The Url host e.g.
138
- # http://www.google.com/how which gives a host of www.google.com.
137
+ # @param opts [Hash] The options with which to check relativity.
138
+ # @option opts [Wgit::Url, String] :host The Url host e.g.
139
+ # http://www.google.com/how which gives a host of 'www.google.com'.
139
140
  # The host must be absolute and prefixed with a protocol.
140
- # @param domain [Wgit::Url, String] The Url domain e.g.
141
- # http://www.google.com/how which gives a domain of google.com. The
141
+ # @option opts [Wgit::Url, String] :domain The Url domain e.g.
142
+ # http://www.google.com/how which gives a domain of 'google.com'. The
142
143
  # domain must be absolute and prefixed with a protocol.
143
- # @return [Boolean] True if relative, false if absolute.
144
+ # @option opts [Wgit::Url, String] :brand The Url brand e.g.
145
+ # http://www.google.com/how which gives a brand of 'google'. The
146
+ # brand must be absolute and prefixed with a protocol.
144
147
  # @raise [RuntimeError] If self is invalid e.g. empty.
145
- def is_relative?(host: nil, domain: nil)
146
- raise "Invalid link: #{self}" if nil? or empty?
147
- raise "Provide host or domain, not both" if host and domain
148
+ # @return [Boolean] True if relative, false if absolute.
149
+ def is_relative?(opts = {})
150
+ opts = { host: nil, domain: nil, brand: nil }.merge(opts)
148
151
 
152
+ raise "Invalid link: '#{self}'" if empty?
153
+ if opts.values.count(nil) < (opts.length - 1)
154
+ raise "Provide only one of: #{opts.keys}"
155
+ end
156
+
157
+ host = opts[:host]
149
158
  if host
150
159
  host = Wgit::Url.new(host)
151
160
  if host.to_base.nil?
@@ -153,6 +162,7 @@ module Wgit
153
162
  end
154
163
  end
155
164
 
165
+ domain = opts[:domain]
156
166
  if domain
157
167
  domain = Wgit::Url.new(domain)
158
168
  if domain.to_base.nil?
@@ -160,11 +170,22 @@ module Wgit
160
170
  end
161
171
  end
162
172
 
173
+ brand = opts[:brand]
174
+ if brand
175
+ brand = Wgit::Url.new(brand)
176
+ if brand.to_base.nil?
177
+ raise "Invalid brand, must be absolute and contain protocol: #{brand}"
178
+ end
179
+ end
180
+
163
181
  if @uri.relative?
164
182
  true
165
183
  else
166
184
  return host ? to_host == host.to_host : false if host
167
185
  return domain ? to_domain == domain.to_domain : false if domain
186
+ return brand ? to_brand == brand.to_brand : false if brand
187
+
188
+ false
168
189
  end
169
190
  end
170
191
 
@@ -240,11 +261,21 @@ module Wgit
240
261
  domain ? Wgit::Url.new(domain) : nil
241
262
  end
242
263
 
264
+ # Returns a new Wgit::Url containing just the brand of this URL e.g.
265
+ # given http://www.google.co.uk/about.html, 'google' is returned.
266
+ #
267
+ # @return [Wgit::Url, nil] Containing just the brand or nil.
268
+ def to_brand
269
+ domain = to_domain
270
+ domain ? Wgit::Url.new(domain.split('.').first) : nil
271
+ end
272
+
243
273
  # Returns only the base of this URL e.g. the protocol and host combined.
244
274
  #
245
275
  # @return [Wgit::Url, nil] Base of self e.g. http://www.google.co.uk or nil.
246
276
  def to_base
247
- return nil if @uri.scheme.nil? or @uri.host.nil?
277
+ return nil if @uri.scheme.nil? || @uri.host.nil?
278
+
248
279
  base = "#{@uri.scheme}://#{@uri.host}"
249
280
  Wgit::Url.new(base)
250
281
  end
@@ -257,8 +288,9 @@ module Wgit
257
288
  # @return [Wgit::Url, nil] Path of self e.g. about.html or nil.
258
289
  def to_path
259
290
  path = @uri.path
260
- return nil if path.nil? or path.empty?
291
+ return nil if path.nil? || path.empty?
261
292
  return Wgit::Url.new('/') if path == '/'
293
+
262
294
  Wgit::Url.new(path).without_slashes
263
295
  end
264
296
 
@@ -300,6 +332,7 @@ module Wgit
300
332
  def to_extension
301
333
  path = to_path
302
334
  return nil unless path
335
+
303
336
  segs = path.split('.')
304
337
  segs.length > 1 ? Wgit::Url.new(segs.last) : nil
305
338
  end
@@ -344,6 +377,7 @@ module Wgit
344
377
  without_base = base_url ? gsub(base_url, '') : self
345
378
 
346
379
  return self if ['', '/'].include?(without_base)
380
+
347
381
  Wgit::Url.new(without_base).without_slashes
348
382
  end
349
383
 
@@ -395,36 +429,37 @@ module Wgit
395
429
  #
396
430
  # @return [Hash] self's instance vars as a Hash.
397
431
  def to_h
398
- ignore = ["@uri"]
432
+ ignore = ['@uri']
399
433
  h = Wgit::Utils.to_h(self, ignore)
400
- Hash[h.to_a.insert(0, ["url", self])] # Insert url at position 0.
401
- end
402
-
403
- alias :uri :to_uri
404
- alias :url :to_url
405
- alias :scheme :to_scheme
406
- alias :to_protocol :to_scheme
407
- alias :protocol :to_scheme
408
- alias :host :to_host
409
- alias :domain :to_domain
410
- alias :base :to_base
411
- alias :path :to_path
412
- alias :endpoint :to_endpoint
413
- alias :query_string :to_query_string
414
- alias :query :to_query_string
415
- alias :anchor :to_anchor
416
- alias :to_fragment :to_anchor
417
- alias :fragment :to_anchor
418
- alias :extension :to_extension
419
- alias :without_query :without_query_string
420
- alias :without_fragment :without_anchor
421
- alias :is_query? :is_query_string?
422
- alias :is_fragment? :is_anchor?
423
- alias :relative_link? :is_relative?
424
- alias :internal_link? :is_relative?
425
- alias :is_internal? :is_relative?
426
- alias :relative? :is_relative?
427
- alias :crawled? :crawled
428
- alias :normalize :normalise
434
+ Hash[h.to_a.insert(0, ['url', self])] # Insert url at position 0.
435
+ end
436
+
437
+ alias uri to_uri
438
+ alias url to_url
439
+ alias scheme to_scheme
440
+ alias to_protocol to_scheme
441
+ alias protocol to_scheme
442
+ alias host to_host
443
+ alias domain to_domain
444
+ alias brand to_brand
445
+ alias base to_base
446
+ alias path to_path
447
+ alias endpoint to_endpoint
448
+ alias query_string to_query_string
449
+ alias query to_query_string
450
+ alias anchor to_anchor
451
+ alias to_fragment to_anchor
452
+ alias fragment to_anchor
453
+ alias extension to_extension
454
+ alias without_query without_query_string
455
+ alias without_fragment without_anchor
456
+ alias is_query? is_query_string?
457
+ alias is_fragment? is_anchor?
458
+ alias relative_link? is_relative?
459
+ alias internal_link? is_relative?
460
+ alias is_internal? is_relative?
461
+ alias relative? is_relative?
462
+ alias crawled? crawled
463
+ alias normalize normalise
429
464
  end
430
465
  end