wgit 0.0.6 → 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4d081963b841bc76f0d02e823d40b1155b9ccf46b77b1611bc19297d9a415e36
4
- data.tar.gz: 9568888d18bf1206c0fe082b7bf2176fe42bd1ea28e2e372d40295d1ae617408
3
+ metadata.gz: 81cc82cb5f9b408ca678b7b1731bac5531e72be11186d3c15d36bfbf61ed3838
4
+ data.tar.gz: b20d7c77389895b7a4dd303cb3a804aa0d1b099a6f98243144f6ae7f04094cd3
5
5
  SHA512:
6
- metadata.gz: 44742705e9e853d587dd3a2daa45c25c0acbd2e0630ddb5ec112f77bf52bd018103998a5a39e37c7d26e3ea6b7580bfb05a75430c48f2c87f8caf806410fb490
7
- data.tar.gz: b5a26fc1cf38c19c15cb75f56dca53327cf7e4c35da0d5a72d1dfa14c872d065714966c5a24936ed7a956f2ebbff27f06fff80b3395e7ea39308289c2c75f3f9
6
+ metadata.gz: c17a87d4bb10f750ea9d19faf16ed0a6de10caeab47d3586a3c2c633ec36a990c238ff9e119763c46868a81c111a4b19abd1cad8f09af7008c89474ff0bbf861
7
+ data.tar.gz: 3ad0950b664dc872bbc9394e6307974ace5a4ac35aac446aada4d470703360faac28efd911e48895708274581a82cb6c1b2ff1f02116122a2e25f817cd54b5c2
@@ -6,6 +6,7 @@ module Wgit
6
6
  DEFAULT_TYPE_FAIL_MSG = "Expected: %s, Actual: %s".freeze
7
7
  WRONG_METHOD_MSG = "arr must be Enumerable, use a different method".freeze
8
8
  DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s".freeze
9
+ DEFAULT_REQUIRED_KEYS_MSG = "Some or all of the required keys are not present: %s".freeze
9
10
 
10
11
  # Tests if the obj is of a given type.
11
12
  #
@@ -57,7 +58,20 @@ module Wgit
57
58
  end
58
59
  obj_or_objs
59
60
  end
60
-
61
+
62
+ # The hash must include? the keys or a KeyError is raised.
63
+ #
64
+ # @param hash [Hash] The hash which should include the required keys.
65
+ # @param keys [Array<String, Symbol>] The keys whose presence to assert.
66
+ # @param msg [String] The raised KeyError message, if provided.
67
+ # @return [Hash] The given hash on successful assertion.
68
+ def assert_required_keys(hash, keys, msg = nil)
69
+ msg ||= DEFAULT_REQUIRED_KEYS_MSG % [keys.join(', ')]
70
+ all_present = keys.all? { |key| hash.keys.include? key }
71
+ raise KeyError.new(msg) unless all_present
72
+ hash
73
+ end
74
+
61
75
  private
62
76
 
63
77
  # obj must respond_to? all methods or an exception is raised.
@@ -0,0 +1,47 @@
1
+ require_relative '../assertable'
2
+
3
+ module Wgit
4
+ extend Assertable
5
+
6
+ # The connection details for the database. This must be set if you want to
7
+ # store and access webpages in a database. Don't set the constant directly,
8
+ # instead use the funcs contained within the Wgit module.
9
+ CONNECTION_DETAILS = {}
10
+
11
+ # The keys required for a successful database connection.
12
+ CONNECTION_KEYS_REQUIRED = [
13
+ 'DB_HOST', 'DB_PORT', 'DB_USERNAME', 'DB_PASSWORD', 'DB_DATABASE'
14
+ ]
15
+
16
+ # Set the database's connection details from the given hash. It is your
17
+ # responsibility to ensure the correct hash vars are present and set.
18
+ #
19
+ # @param hash [Hash] Containing the database connection details to use.
20
+ # The hash should contain the following keys (of type String):
21
+ # DB_HOST, DB_PORT, DB_USERNAME, DB_PASSWORD, DB_DATABASE
22
+ # @raise [KeyError] If any of the required connection details are missing.
23
+ # @return [Hash] Containing the database connection details from hash.
24
+ def self.set_connection_details(hash)
25
+ assert_required_keys(hash, CONNECTION_KEYS_REQUIRED)
26
+
27
+ CONNECTION_DETAILS[:host] = hash.fetch('DB_HOST')
28
+ CONNECTION_DETAILS[:port] = hash.fetch('DB_PORT')
29
+ CONNECTION_DETAILS[:uname] = hash.fetch('DB_USERNAME')
30
+ CONNECTION_DETAILS[:pword] = hash.fetch('DB_PASSWORD')
31
+ CONNECTION_DETAILS[:db] = hash.fetch('DB_DATABASE')
32
+
33
+ CONNECTION_DETAILS
34
+ end
35
+
36
+ # Set the database's connection details from the ENV. It is your
37
+ # responsibility to ensure the correct ENV vars are present and set.
38
+ #
39
+ # The ENV should contain the following keys (of type String):
40
+ # DB_HOST, DB_PORT, DB_USERNAME, DB_PASSWORD, DB_DATABASE
41
+ #
42
+ # @raise [KeyError] If any of the required connection details are missing.
43
+ # @return [Hash] Containing the database connection details from the ENV.
44
+ def self.set_connection_details_from_env
45
+ self.set_connection_details(ENV)
46
+ end
47
+ end
@@ -3,6 +3,7 @@ require_relative '../url'
3
3
  require_relative '../utils'
4
4
  require_relative '../assertable'
5
5
  require_relative 'model'
6
+ require 'logger'
6
7
  require 'mongo'
7
8
 
8
9
  module Wgit
@@ -21,17 +22,19 @@ module Wgit
21
22
  raise "Wgit::CONNECTION_DETAILS must be defined and include :host,
22
23
  :port, :db, :uname, :pword for a database connection to be established."
23
24
  end
24
-
25
- # Only log to STDOUT in fatal scenarios.
26
- Mongo::Logger.logger.level = Logger::FATAL
27
-
25
+
26
+ # Only log for error (or more severe) scenarios.
27
+ Mongo::Logger.logger = Wgit.logger.clone
28
+ Mongo::Logger.logger.progname = 'mongo'
29
+ Mongo::Logger.logger.level = Logger::ERROR
30
+
28
31
  address = "#{conn_details[:host]}:#{conn_details[:port]}"
29
32
  @@client = Mongo::Client.new([address],
30
- database: conn_details[:db],
31
- user: conn_details[:uname],
32
- password: conn_details[:pword])
33
+ database: conn_details[:db],
34
+ user: conn_details[:uname],
35
+ password: conn_details[:pword])
33
36
  end
34
-
37
+
35
38
  ### Create Data ###
36
39
 
37
40
  # Insert one or more Url or Document objects into the DB.
data/lib/wgit/document.rb CHANGED
@@ -24,7 +24,7 @@ module Wgit
24
24
  :main, :ol, :p, :pre, :span, :ul, :h1, :h2, :h3, :h4, :h5
25
25
  ]
26
26
 
27
- # The URL of the webpage, an instance of Wgit:Url.
27
+ # The URL of the webpage, an instance of Wgit::Url.
28
28
  attr_reader :url
29
29
 
30
30
  # The HTML of the webpage, an instance of String.
@@ -56,7 +56,7 @@ module Wgit
56
56
  # Init from URL String and HTML String.
57
57
  if url_or_obj.is_a?(String)
58
58
  url = url_or_obj
59
- assert_type(url, Url)
59
+ assert_type(url, Wgit::Url)
60
60
 
61
61
  @url = url
62
62
  @html = html ||= ""
data/lib/wgit/indexer.rb CHANGED
@@ -8,8 +8,8 @@ module Wgit
8
8
  #
9
9
  # Retrieves uncrawled url's from the database and recursively crawls each
10
10
  # site storing their internal pages into the database and adding their
11
- # external url's to be crawled at a later date. Puts out info on the crawl
12
- # to STDOUT as it goes along.
11
+ # external url's to be crawled later on. Logs info on the crawl
12
+ # using Wgit.logger as it goes along.
13
13
  #
14
14
  # @param max_sites_to_crawl [Integer] The number of separate and whole
15
15
  # websites to be crawled before the method exits. Defaults to -1 which
@@ -81,8 +81,8 @@ module Wgit
81
81
 
82
82
  # Retrieves uncrawled url's from the database and recursively crawls each
83
83
  # site storing their internal pages into the database and adding their
84
- # external url's to be crawled at a later date. Puts out info on the crawl
85
- # to STDOUT as it goes along.
84
+ # external url's to be crawled later on. Logs info on the crawl
85
+ # using Wgit.logger as it goes along.
86
86
  #
87
87
  # @param max_sites_to_crawl [Integer] The number of separate and whole
88
88
  # websites to be crawled before the method exits. Defaults to -1 which
@@ -93,28 +93,28 @@ module Wgit
93
93
  # that will be obtained.
94
94
  def index_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
95
95
  if max_sites_to_crawl < 0
96
- puts "Indexing until the database has been filled or it runs out of \
97
- urls to crawl (which might be never)."
96
+ Wgit.logger.info("Indexing until the database has been filled or it runs out of \
97
+ urls to crawl (which might be never).")
98
98
  end
99
99
  site_count = 0
100
100
 
101
101
  while keep_crawling?(site_count, max_sites_to_crawl, max_data_size) do
102
- puts "Current database size: #{@db.size}"
102
+ Wgit.logger.info("Current database size: #{@db.size}")
103
103
  @crawler.urls = @db.uncrawled_urls
104
104
 
105
105
  if @crawler.urls.empty?
106
- puts "No urls to crawl, exiting."
106
+ Wgit.logger.info("No urls to crawl, exiting.")
107
107
  return
108
108
  end
109
- puts "Starting crawl loop for: #{@crawler.urls}"
109
+ Wgit.logger.info("Starting crawl loop for: #{@crawler.urls}")
110
110
 
111
111
  docs_count = 0
112
112
  urls_count = 0
113
113
 
114
114
  @crawler.urls.each do |url|
115
115
  unless keep_crawling?(site_count, max_sites_to_crawl, max_data_size)
116
- puts "Reached max number of sites to crawl or database \
117
- capacity, exiting."
116
+ Wgit.logger.info("Reached max number of sites to crawl or database \
117
+ capacity, exiting.")
118
118
  return
119
119
  end
120
120
  site_count += 1
@@ -133,20 +133,20 @@ capacity, exiting."
133
133
  end
134
134
 
135
135
  urls_count += write_urls_to_db(ext_links)
136
- puts "Crawled and saved #{site_docs_count} docs for the \
137
- site: #{url}"
136
+ Wgit.logger.info("Crawled and saved #{site_docs_count} docs for the \
137
+ site: #{url}")
138
138
  end
139
139
 
140
- puts "Crawled and saved docs for #{docs_count} url(s) overall for \
141
- this iteration."
142
- puts "Found and saved #{urls_count} external url(s) for the next \
143
- iteration."
140
+ Wgit.logger.info("Crawled and saved docs for #{docs_count} url(s) overall for \
141
+ this iteration.")
142
+ Wgit.logger.info("Found and saved #{urls_count} external url(s) for the next \
143
+ iteration.")
144
144
  end
145
145
  end
146
146
 
147
147
  # Crawls a single website's pages and stores them into the database.
148
148
  # There is no max download limit so be careful which sites you index.
149
- # Puts out info on the crawl to STDOUT as it goes along.
149
+ # Logs info on the crawl using Wgit.logger as it goes along.
150
150
  #
151
151
  # @param url [Wgit::Url] The base Url of the website to crawl.
152
152
  # @param insert_externals [Boolean] Whether or not to insert the website's
@@ -168,7 +168,7 @@ iteration."
168
168
  if result
169
169
  if write_doc_to_db(doc)
170
170
  total_pages_indexed += 1
171
- puts "Crawled and saved internal page: #{doc.url}"
171
+ Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
172
172
  end
173
173
  end
174
174
  end
@@ -182,11 +182,11 @@ iteration."
182
182
 
183
183
  if insert_externals
184
184
  write_urls_to_db(ext_urls)
185
- puts "Found and saved #{ext_urls.length} external url(s)"
185
+ Wgit.logger.info("Found and saved #{ext_urls.length} external url(s)")
186
186
  end
187
187
 
188
- puts "Crawled and saved #{total_pages_indexed} docs for the \
189
- site: #{url}"
188
+ Wgit.logger.info("Crawled and saved #{total_pages_indexed} docs for the \
189
+ site: #{url}")
190
190
 
191
191
  total_pages_indexed
192
192
  end
@@ -208,10 +208,10 @@ site: #{url}"
208
208
  # inserts.
209
209
  def write_doc_to_db(doc)
210
210
  @db.insert(doc)
211
- puts "Saved document for url: #{doc.url}"
211
+ Wgit.logger.info("Saved document for url: #{doc.url}")
212
212
  true
213
213
  rescue Mongo::Error::OperationFailure
214
- puts "Document already exists: #{doc.url}"
214
+ Wgit.logger.info("Document already exists: #{doc.url}")
215
215
  false
216
216
  end
217
217
 
@@ -223,9 +223,9 @@ site: #{url}"
223
223
  begin
224
224
  @db.insert(url)
225
225
  count += 1
226
- puts "Inserted url: #{url}"
226
+ Wgit.logger.info("Inserted url: #{url}")
227
227
  rescue Mongo::Error::OperationFailure
228
- puts "Url already exists: #{url}"
228
+ Wgit.logger.info("Url already exists: #{url}")
229
229
  end
230
230
  end
231
231
  end
@@ -0,0 +1,36 @@
1
+ # FYI: The default logger is set at the bottom of this file.
2
+
3
+ require 'logger'
4
+
5
+ module Wgit
6
+ # The Logger instance used by Wgit. Set your own custom logger after
7
+ # requiring this file if needed.
8
+ @logger = nil
9
+
10
+ # Returns the current Logger instance.
11
+ # @return [Logger] The current Logger instance.
12
+ def self.logger
13
+ @logger
14
+ end
15
+
16
+ # Sets the current Logger instance.
17
+ # @param logger [Logger] The Logger instance to use.
18
+ # @return [Logger] The current Logger instance having been set.
19
+ def self.logger=(logger)
20
+ @logger = logger
21
+ end
22
+
23
+ # Returns the default Logger instance.
24
+ # @return [Logger] The default Logger instance.
25
+ def self.default_logger
26
+ Logger.new(STDOUT, progname: 'wgit', level: :info)
27
+ end
28
+
29
+ # Sets the default Logger instance to be used by Wgit.
30
+ # @return [Logger] The default Logger instance.
31
+ def self.use_default_logger
32
+ @logger = self.default_logger
33
+ end
34
+ end
35
+
36
+ Wgit.use_default_logger
data/lib/wgit/version.rb CHANGED
@@ -3,5 +3,5 @@
3
3
  # @author Michael Telford
4
4
  module Wgit
5
5
  # The current gem version of Wgit.
6
- VERSION = "0.0.6".freeze
6
+ VERSION = "0.0.7".freeze
7
7
  end
data/lib/wgit.rb CHANGED
@@ -1,11 +1,12 @@
1
1
  require_relative 'wgit/version'
2
- require_relative 'wgit/crawler'
3
- require_relative 'wgit/indexer'
2
+ require_relative 'wgit/logger'
3
+ require_relative 'wgit/assertable'
4
+ require_relative 'wgit/utils'
4
5
  require_relative 'wgit/url'
5
6
  require_relative 'wgit/document'
6
- require_relative 'wgit/utils'
7
- require_relative 'wgit/assertable'
8
- require_relative 'wgit/database/database'
7
+ require_relative 'wgit/crawler'
8
+ require_relative 'wgit/database/connection_details'
9
9
  require_relative 'wgit/database/model'
10
- require_relative 'wgit/database/mongo_connection_details'
11
- #require_relative 'wgit/core_ext'
10
+ require_relative 'wgit/database/database'
11
+ require_relative 'wgit/indexer'
12
+ #require_relative 'wgit/core_ext' - Must be explicitly required.
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wgit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
@@ -94,6 +94,20 @@ dependencies:
94
94
  - - "~>"
95
95
  - !ruby/object:Gem::Version
96
96
  version: '12.3'
97
+ - !ruby/object:Gem::Dependency
98
+ name: httplog
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '1.3'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '1.3'
97
111
  - !ruby/object:Gem::Dependency
98
112
  name: nokogiri
99
113
  requirement: !ruby/object:Gem::Requirement
@@ -138,11 +152,12 @@ files:
138
152
  - "./lib/wgit/assertable.rb"
139
153
  - "./lib/wgit/core_ext.rb"
140
154
  - "./lib/wgit/crawler.rb"
155
+ - "./lib/wgit/database/connection_details.rb"
141
156
  - "./lib/wgit/database/database.rb"
142
157
  - "./lib/wgit/database/model.rb"
143
- - "./lib/wgit/database/mongo_connection_details.rb"
144
158
  - "./lib/wgit/document.rb"
145
159
  - "./lib/wgit/indexer.rb"
160
+ - "./lib/wgit/logger.rb"
146
161
  - "./lib/wgit/url.rb"
147
162
  - "./lib/wgit/utils.rb"
148
163
  - "./lib/wgit/version.rb"
@@ -1,48 +0,0 @@
1
- module Wgit
2
- # The connection details for the database. This must be set if you want to
3
- # store and access webpages in a database. Don't set the constant directly,
4
- # instead use the funcs contained within the Wgit module.
5
- CONNECTION_DETAILS = {}
6
-
7
- # Set the database's connection details from the given hash and freeze them.
8
- # It is your responsibility to ensure the correct hash vars are present and
9
- # set. Due to the freezing of the CONNECTION_DETAILS, this func is designed
10
- # to be called only once.
11
- #
12
- # @param hash [Hash] Containing the database connection details to use.
13
- # The hash should contain the following keys (of type String):
14
- # host, port, uname, pword, db
15
- # @raise [KeyError, FrozenError] If any of the required connection
16
- # details are missing or if the connection details have already been set.
17
- # @return [Hash] Containing the database connection details from hash.
18
- def self.set_connection_details(hash)
19
- CONNECTION_DETAILS[:host] = hash.fetch('host')
20
- CONNECTION_DETAILS[:port] = hash.fetch('port')
21
- CONNECTION_DETAILS[:uname] = hash.fetch('uname')
22
- CONNECTION_DETAILS[:pword] = hash.fetch('pword')
23
- CONNECTION_DETAILS[:db] = hash.fetch('db')
24
-
25
- CONNECTION_DETAILS.freeze
26
- end
27
-
28
- # Set the database's connection details from the ENV and freeze them. It is
29
- # your responsibility to ensure the correct ENV vars are present and set.
30
- # Due to the freezing of the CONNECTION_DETAILS, this func is designed to be
31
- # called only once.
32
- #
33
- # The ENV should contain the following keys (of type String):
34
- # DB_HOST, DB_PORT, DB_USERNAME, DB_PASSWORD, DB_DATABASE
35
- #
36
- # @raise [KeyError, FrozenError] If any of the required connection
37
- # details are missing or if the connection details have already been set.
38
- # @return [Hash] Containing the database connection details from the ENV.
39
- def self.set_connection_details_from_env
40
- CONNECTION_DETAILS[:host] = ENV.fetch('DB_HOST')
41
- CONNECTION_DETAILS[:port] = ENV.fetch('DB_PORT')
42
- CONNECTION_DETAILS[:uname] = ENV.fetch('DB_USERNAME')
43
- CONNECTION_DETAILS[:pword] = ENV.fetch('DB_PASSWORD')
44
- CONNECTION_DETAILS[:db] = ENV.fetch('DB_DATABASE')
45
-
46
- CONNECTION_DETAILS.freeze
47
- end
48
- end