wgit 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4d081963b841bc76f0d02e823d40b1155b9ccf46b77b1611bc19297d9a415e36
4
- data.tar.gz: 9568888d18bf1206c0fe082b7bf2176fe42bd1ea28e2e372d40295d1ae617408
3
+ metadata.gz: 81cc82cb5f9b408ca678b7b1731bac5531e72be11186d3c15d36bfbf61ed3838
4
+ data.tar.gz: b20d7c77389895b7a4dd303cb3a804aa0d1b099a6f98243144f6ae7f04094cd3
5
5
  SHA512:
6
- metadata.gz: 44742705e9e853d587dd3a2daa45c25c0acbd2e0630ddb5ec112f77bf52bd018103998a5a39e37c7d26e3ea6b7580bfb05a75430c48f2c87f8caf806410fb490
7
- data.tar.gz: b5a26fc1cf38c19c15cb75f56dca53327cf7e4c35da0d5a72d1dfa14c872d065714966c5a24936ed7a956f2ebbff27f06fff80b3395e7ea39308289c2c75f3f9
6
+ metadata.gz: c17a87d4bb10f750ea9d19faf16ed0a6de10caeab47d3586a3c2c633ec36a990c238ff9e119763c46868a81c111a4b19abd1cad8f09af7008c89474ff0bbf861
7
+ data.tar.gz: 3ad0950b664dc872bbc9394e6307974ace5a4ac35aac446aada4d470703360faac28efd911e48895708274581a82cb6c1b2ff1f02116122a2e25f817cd54b5c2
@@ -6,6 +6,7 @@ module Wgit
6
6
  DEFAULT_TYPE_FAIL_MSG = "Expected: %s, Actual: %s".freeze
7
7
  WRONG_METHOD_MSG = "arr must be Enumerable, use a different method".freeze
8
8
  DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s".freeze
9
+ DEFAULT_REQUIRED_KEYS_MSG = "Some or all of the required keys are not present: %s".freeze
9
10
 
10
11
  # Tests if the obj is of a given type.
11
12
  #
@@ -57,7 +58,20 @@ module Wgit
57
58
  end
58
59
  obj_or_objs
59
60
  end
60
-
61
+
62
+ # The hash must include? the keys or a KeyError is raised.
63
+ #
64
+ # @param hash [Hash] The hash which should include the required keys.
65
+ # @param keys [Array<String, Symbol>] The keys whose presence to assert.
66
+ # @param msg [String] The raised KeyError message, if provided.
67
+ # @return [Hash] The given hash on successful assertion.
68
+ def assert_required_keys(hash, keys, msg = nil)
69
+ msg ||= DEFAULT_REQUIRED_KEYS_MSG % [keys.join(', ')]
70
+ all_present = keys.all? { |key| hash.keys.include? key }
71
+ raise KeyError.new(msg) unless all_present
72
+ hash
73
+ end
74
+
61
75
  private
62
76
 
63
77
  # obj must respond_to? all methods or an exception is raised.
@@ -0,0 +1,47 @@
1
+ require_relative '../assertable'
2
+
3
+ module Wgit
4
+ extend Assertable
5
+
6
+ # The connection details for the database. This must be set if you want to
7
+ # store and access webpages in a database. Don't set the constant directly,
8
+ # instead use the funcs contained within the Wgit module.
9
+ CONNECTION_DETAILS = {}
10
+
11
+ # The keys required for a successful database connection.
12
+ CONNECTION_KEYS_REQUIRED = [
13
+ 'DB_HOST', 'DB_PORT', 'DB_USERNAME', 'DB_PASSWORD', 'DB_DATABASE'
14
+ ]
15
+
16
+ # Set the database's connection details from the given hash. It is your
17
+ # responsibility to ensure the correct hash vars are present and set.
18
+ #
19
+ # @param hash [Hash] Containing the database connection details to use.
20
+ # The hash should contain the following keys (of type String):
21
+ # DB_HOST, DB_PORT, DB_USERNAME, DB_PASSWORD, DB_DATABASE
22
+ # @raise [KeyError] If any of the required connection details are missing.
23
+ # @return [Hash] Containing the database connection details from hash.
24
+ def self.set_connection_details(hash)
25
+ assert_required_keys(hash, CONNECTION_KEYS_REQUIRED)
26
+
27
+ CONNECTION_DETAILS[:host] = hash.fetch('DB_HOST')
28
+ CONNECTION_DETAILS[:port] = hash.fetch('DB_PORT')
29
+ CONNECTION_DETAILS[:uname] = hash.fetch('DB_USERNAME')
30
+ CONNECTION_DETAILS[:pword] = hash.fetch('DB_PASSWORD')
31
+ CONNECTION_DETAILS[:db] = hash.fetch('DB_DATABASE')
32
+
33
+ CONNECTION_DETAILS
34
+ end
35
+
36
+ # Set the database's connection details from the ENV. It is your
37
+ # responsibility to ensure the correct ENV vars are present and set.
38
+ #
39
+ # The ENV should contain the following keys (of type String):
40
+ # DB_HOST, DB_PORT, DB_USERNAME, DB_PASSWORD, DB_DATABASE
41
+ #
42
+ # @raise [KeyError] If any of the required connection details are missing.
43
+ # @return [Hash] Containing the database connection details from the ENV.
44
+ def self.set_connection_details_from_env
45
+ self.set_connection_details(ENV)
46
+ end
47
+ end
@@ -3,6 +3,7 @@ require_relative '../url'
3
3
  require_relative '../utils'
4
4
  require_relative '../assertable'
5
5
  require_relative 'model'
6
+ require 'logger'
6
7
  require 'mongo'
7
8
 
8
9
  module Wgit
@@ -21,17 +22,19 @@ module Wgit
21
22
  raise "Wgit::CONNECTION_DETAILS must be defined and include :host,
22
23
  :port, :db, :uname, :pword for a database connection to be established."
23
24
  end
24
-
25
- # Only log to STDOUT in fatal scenarios.
26
- Mongo::Logger.logger.level = Logger::FATAL
27
-
25
+
26
+ # Only log for error (or more severe) scenarios.
27
+ Mongo::Logger.logger = Wgit.logger.clone
28
+ Mongo::Logger.logger.progname = 'mongo'
29
+ Mongo::Logger.logger.level = Logger::ERROR
30
+
28
31
  address = "#{conn_details[:host]}:#{conn_details[:port]}"
29
32
  @@client = Mongo::Client.new([address],
30
- database: conn_details[:db],
31
- user: conn_details[:uname],
32
- password: conn_details[:pword])
33
+ database: conn_details[:db],
34
+ user: conn_details[:uname],
35
+ password: conn_details[:pword])
33
36
  end
34
-
37
+
35
38
  ### Create Data ###
36
39
 
37
40
  # Insert one or more Url or Document objects into the DB.
data/lib/wgit/document.rb CHANGED
@@ -24,7 +24,7 @@ module Wgit
24
24
  :main, :ol, :p, :pre, :span, :ul, :h1, :h2, :h3, :h4, :h5
25
25
  ]
26
26
 
27
- # The URL of the webpage, an instance of Wgit:Url.
27
+ # The URL of the webpage, an instance of Wgit::Url.
28
28
  attr_reader :url
29
29
 
30
30
  # The HTML of the webpage, an instance of String.
@@ -56,7 +56,7 @@ module Wgit
56
56
  # Init from URL String and HTML String.
57
57
  if url_or_obj.is_a?(String)
58
58
  url = url_or_obj
59
- assert_type(url, Url)
59
+ assert_type(url, Wgit::Url)
60
60
 
61
61
  @url = url
62
62
  @html = html ||= ""
data/lib/wgit/indexer.rb CHANGED
@@ -8,8 +8,8 @@ module Wgit
8
8
  #
9
9
  # Retrieves uncrawled urls from the database and recursively crawls each
10
10
  # site storing their internal pages into the database and adding their
11
- # external url's to be crawled at a later date. Puts out info on the crawl
12
- # to STDOUT as it goes along.
11
+ # external urls to be crawled later on. Logs info on the crawl
12
+ # using Wgit.logger as it goes along.
13
13
  #
14
14
  # @param max_sites_to_crawl [Integer] The number of separate and whole
15
15
  # websites to be crawled before the method exits. Defaults to -1 which
@@ -81,8 +81,8 @@ module Wgit
81
81
 
82
82
  # Retrieves uncrawled urls from the database and recursively crawls each
83
83
  # site storing their internal pages into the database and adding their
84
- # external url's to be crawled at a later date. Puts out info on the crawl
85
- # to STDOUT as it goes along.
84
+ # external urls to be crawled later on. Logs info on the crawl
85
+ # using Wgit.logger as it goes along.
86
86
  #
87
87
  # @param max_sites_to_crawl [Integer] The number of separate and whole
88
88
  # websites to be crawled before the method exits. Defaults to -1 which
@@ -93,28 +93,28 @@ module Wgit
93
93
  # that will be obtained.
94
94
  def index_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
95
95
  if max_sites_to_crawl < 0
96
- puts "Indexing until the database has been filled or it runs out of \
97
- urls to crawl (which might be never)."
96
+ Wgit.logger.info("Indexing until the database has been filled or it runs out of \
97
+ urls to crawl (which might be never).")
98
98
  end
99
99
  site_count = 0
100
100
 
101
101
  while keep_crawling?(site_count, max_sites_to_crawl, max_data_size) do
102
- puts "Current database size: #{@db.size}"
102
+ Wgit.logger.info("Current database size: #{@db.size}")
103
103
  @crawler.urls = @db.uncrawled_urls
104
104
 
105
105
  if @crawler.urls.empty?
106
- puts "No urls to crawl, exiting."
106
+ Wgit.logger.info("No urls to crawl, exiting.")
107
107
  return
108
108
  end
109
- puts "Starting crawl loop for: #{@crawler.urls}"
109
+ Wgit.logger.info("Starting crawl loop for: #{@crawler.urls}")
110
110
 
111
111
  docs_count = 0
112
112
  urls_count = 0
113
113
 
114
114
  @crawler.urls.each do |url|
115
115
  unless keep_crawling?(site_count, max_sites_to_crawl, max_data_size)
116
- puts "Reached max number of sites to crawl or database \
117
- capacity, exiting."
116
+ Wgit.logger.info("Reached max number of sites to crawl or database \
117
+ capacity, exiting.")
118
118
  return
119
119
  end
120
120
  site_count += 1
@@ -133,20 +133,20 @@ capacity, exiting."
133
133
  end
134
134
 
135
135
  urls_count += write_urls_to_db(ext_links)
136
- puts "Crawled and saved #{site_docs_count} docs for the \
137
- site: #{url}"
136
+ Wgit.logger.info("Crawled and saved #{site_docs_count} docs for the \
137
+ site: #{url}")
138
138
  end
139
139
 
140
- puts "Crawled and saved docs for #{docs_count} url(s) overall for \
141
- this iteration."
142
- puts "Found and saved #{urls_count} external url(s) for the next \
143
- iteration."
140
+ Wgit.logger.info("Crawled and saved docs for #{docs_count} url(s) overall for \
141
+ this iteration.")
142
+ Wgit.logger.info("Found and saved #{urls_count} external url(s) for the next \
143
+ iteration.")
144
144
  end
145
145
  end
146
146
 
147
147
  # Crawls a single website's pages and stores them into the database.
148
148
  # There is no max download limit so be careful which sites you index.
149
- # Puts out info on the crawl to STDOUT as it goes along.
149
+ # Logs info on the crawl using Wgit.logger as it goes along.
150
150
  #
151
151
  # @param url [Wgit::Url] The base Url of the website to crawl.
152
152
  # @param insert_externals [Boolean] Whether or not to insert the website's
@@ -168,7 +168,7 @@ iteration."
168
168
  if result
169
169
  if write_doc_to_db(doc)
170
170
  total_pages_indexed += 1
171
- puts "Crawled and saved internal page: #{doc.url}"
171
+ Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
172
172
  end
173
173
  end
174
174
  end
@@ -182,11 +182,11 @@ iteration."
182
182
 
183
183
  if insert_externals
184
184
  write_urls_to_db(ext_urls)
185
- puts "Found and saved #{ext_urls.length} external url(s)"
185
+ Wgit.logger.info("Found and saved #{ext_urls.length} external url(s)")
186
186
  end
187
187
 
188
- puts "Crawled and saved #{total_pages_indexed} docs for the \
189
- site: #{url}"
188
+ Wgit.logger.info("Crawled and saved #{total_pages_indexed} docs for the \
189
+ site: #{url}")
190
190
 
191
191
  total_pages_indexed
192
192
  end
@@ -208,10 +208,10 @@ site: #{url}"
208
208
  # inserts.
209
209
  def write_doc_to_db(doc)
210
210
  @db.insert(doc)
211
- puts "Saved document for url: #{doc.url}"
211
+ Wgit.logger.info("Saved document for url: #{doc.url}")
212
212
  true
213
213
  rescue Mongo::Error::OperationFailure
214
- puts "Document already exists: #{doc.url}"
214
+ Wgit.logger.info("Document already exists: #{doc.url}")
215
215
  false
216
216
  end
217
217
 
@@ -223,9 +223,9 @@ site: #{url}"
223
223
  begin
224
224
  @db.insert(url)
225
225
  count += 1
226
- puts "Inserted url: #{url}"
226
+ Wgit.logger.info("Inserted url: #{url}")
227
227
  rescue Mongo::Error::OperationFailure
228
- puts "Url already exists: #{url}"
228
+ Wgit.logger.info("Url already exists: #{url}")
229
229
  end
230
230
  end
231
231
  end
@@ -0,0 +1,36 @@
1
+ # FYI: The default logger is set at the bottom of this file.
2
+
3
+ require 'logger'
4
+
5
+ module Wgit
6
+ # The Logger instance used by Wgit. Set your own custom logger after
7
+ # requiring this file if needed.
8
+ @logger = nil
9
+
10
+ # Returns the current Logger instance.
11
+ # @return [Logger] The current Logger instance.
12
+ def self.logger
13
+ @logger
14
+ end
15
+
16
+ # Sets the current Logger instance.
17
+ # @param logger [Logger] The Logger instance to use.
18
+ # @return [Logger] The current Logger instance having been set.
19
+ def self.logger=(logger)
20
+ @logger = logger
21
+ end
22
+
23
+ # Returns the default Logger instance.
24
+ # @return [Logger] The default Logger instance.
25
+ def self.default_logger
26
+ Logger.new(STDOUT, progname: 'wgit', level: :info)
27
+ end
28
+
29
+ # Sets the default Logger instance to be used by Wgit.
30
+ # @return [Logger] The default Logger instance.
31
+ def self.use_default_logger
32
+ @logger = self.default_logger
33
+ end
34
+ end
35
+
36
+ Wgit.use_default_logger
data/lib/wgit/version.rb CHANGED
@@ -3,5 +3,5 @@
3
3
  # @author Michael Telford
4
4
  module Wgit
5
5
  # The current gem version of Wgit.
6
- VERSION = "0.0.6".freeze
6
+ VERSION = "0.0.7".freeze
7
7
  end
data/lib/wgit.rb CHANGED
@@ -1,11 +1,12 @@
1
1
  require_relative 'wgit/version'
2
- require_relative 'wgit/crawler'
3
- require_relative 'wgit/indexer'
2
+ require_relative 'wgit/logger'
3
+ require_relative 'wgit/assertable'
4
+ require_relative 'wgit/utils'
4
5
  require_relative 'wgit/url'
5
6
  require_relative 'wgit/document'
6
- require_relative 'wgit/utils'
7
- require_relative 'wgit/assertable'
8
- require_relative 'wgit/database/database'
7
+ require_relative 'wgit/crawler'
8
+ require_relative 'wgit/database/connection_details'
9
9
  require_relative 'wgit/database/model'
10
- require_relative 'wgit/database/mongo_connection_details'
11
- #require_relative 'wgit/core_ext'
10
+ require_relative 'wgit/database/database'
11
+ require_relative 'wgit/indexer'
12
+ #require_relative 'wgit/core_ext' - Must be explicitly required.
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wgit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
@@ -94,6 +94,20 @@ dependencies:
94
94
  - - "~>"
95
95
  - !ruby/object:Gem::Version
96
96
  version: '12.3'
97
+ - !ruby/object:Gem::Dependency
98
+ name: httplog
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '1.3'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '1.3'
97
111
  - !ruby/object:Gem::Dependency
98
112
  name: nokogiri
99
113
  requirement: !ruby/object:Gem::Requirement
@@ -138,11 +152,12 @@ files:
138
152
  - "./lib/wgit/assertable.rb"
139
153
  - "./lib/wgit/core_ext.rb"
140
154
  - "./lib/wgit/crawler.rb"
155
+ - "./lib/wgit/database/connection_details.rb"
141
156
  - "./lib/wgit/database/database.rb"
142
157
  - "./lib/wgit/database/model.rb"
143
- - "./lib/wgit/database/mongo_connection_details.rb"
144
158
  - "./lib/wgit/document.rb"
145
159
  - "./lib/wgit/indexer.rb"
160
+ - "./lib/wgit/logger.rb"
146
161
  - "./lib/wgit/url.rb"
147
162
  - "./lib/wgit/utils.rb"
148
163
  - "./lib/wgit/version.rb"
@@ -1,48 +0,0 @@
1
- module Wgit
2
- # The connection details for the database. This must be set if you want to
3
- # store and access webpages in a database. Don't set the constant directly,
4
- # instead use the funcs contained within the Wgit module.
5
- CONNECTION_DETAILS = {}
6
-
7
- # Set the database's connection details from the given hash and freeze them.
8
- # It is your responsibility to ensure the correct hash vars are present and
9
- # set. Due to the freezing of the CONNECTION_DETAILS, this func is designed
10
- # to be called only once.
11
- #
12
- # @param hash [Hash] Containing the database connection details to use.
13
- # The hash should contain the following keys (of type String):
14
- # host, port, uname, pword, db
15
- # @raise [KeyError, FrozenError] If any of the required connection
16
- # details are missing or if the connection details have already been set.
17
- # @return [Hash] Containing the database connection details from hash.
18
- def self.set_connection_details(hash)
19
- CONNECTION_DETAILS[:host] = hash.fetch('host')
20
- CONNECTION_DETAILS[:port] = hash.fetch('port')
21
- CONNECTION_DETAILS[:uname] = hash.fetch('uname')
22
- CONNECTION_DETAILS[:pword] = hash.fetch('pword')
23
- CONNECTION_DETAILS[:db] = hash.fetch('db')
24
-
25
- CONNECTION_DETAILS.freeze
26
- end
27
-
28
- # Set the database's connection details from the ENV and freeze them. It is
29
- # your responsibility to ensure the correct ENV vars are present and set.
30
- # Due to the freezing of the CONNECTION_DETAILS, this func is designed to be
31
- # called only once.
32
- #
33
- # The ENV should contain the following keys (of type String):
34
- # DB_HOST, DB_PORT, DB_USERNAME, DB_PASSWORD, DB_DATABASE
35
- #
36
- # @raise [KeyError, FrozenError] If any of the required connection
37
- # details are missing or if the connection details have already been set.
38
- # @return [Hash] Containing the database connection details from the ENV.
39
- def self.set_connection_details_from_env
40
- CONNECTION_DETAILS[:host] = ENV.fetch('DB_HOST')
41
- CONNECTION_DETAILS[:port] = ENV.fetch('DB_PORT')
42
- CONNECTION_DETAILS[:uname] = ENV.fetch('DB_USERNAME')
43
- CONNECTION_DETAILS[:pword] = ENV.fetch('DB_PASSWORD')
44
- CONNECTION_DETAILS[:db] = ENV.fetch('DB_DATABASE')
45
-
46
- CONNECTION_DETAILS.freeze
47
- end
48
- end