wgit 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wgit/assertable.rb +15 -1
- data/lib/wgit/database/connection_details.rb +47 -0
- data/lib/wgit/database/database.rb +11 -8
- data/lib/wgit/document.rb +2 -2
- data/lib/wgit/indexer.rb +26 -26
- data/lib/wgit/logger.rb +36 -0
- data/lib/wgit/version.rb +1 -1
- data/lib/wgit.rb +8 -7
- metadata +17 -2
- data/lib/wgit/database/mongo_connection_details.rb +0 -48
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 81cc82cb5f9b408ca678b7b1731bac5531e72be11186d3c15d36bfbf61ed3838
|
4
|
+
data.tar.gz: b20d7c77389895b7a4dd303cb3a804aa0d1b099a6f98243144f6ae7f04094cd3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c17a87d4bb10f750ea9d19faf16ed0a6de10caeab47d3586a3c2c633ec36a990c238ff9e119763c46868a81c111a4b19abd1cad8f09af7008c89474ff0bbf861
|
7
|
+
data.tar.gz: 3ad0950b664dc872bbc9394e6307974ace5a4ac35aac446aada4d470703360faac28efd911e48895708274581a82cb6c1b2ff1f02116122a2e25f817cd54b5c2
|
data/lib/wgit/assertable.rb
CHANGED
@@ -6,6 +6,7 @@ module Wgit
|
|
6
6
|
DEFAULT_TYPE_FAIL_MSG = "Expected: %s, Actual: %s".freeze
|
7
7
|
WRONG_METHOD_MSG = "arr must be Enumerable, use a different method".freeze
|
8
8
|
DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s".freeze
|
9
|
+
DEFAULT_REQUIRED_KEYS_MSG = "Some or all of the required keys are not present: %s".freeze
|
9
10
|
|
10
11
|
# Tests if the obj is of a given type.
|
11
12
|
#
|
@@ -57,7 +58,20 @@ module Wgit
|
|
57
58
|
end
|
58
59
|
obj_or_objs
|
59
60
|
end
|
60
|
-
|
61
|
+
|
62
|
+
# The hash must include? the keys or a KeyError is raised.
|
63
|
+
#
|
64
|
+
# @param hash [Hash] The hash which should include the required keys.
|
65
|
+
# @param keys [Array<String, Symbol>] The keys whose presence to assert.
|
66
|
+
# @param msg [String] The raised KeyError message, if provided.
|
67
|
+
# @return [Hash] The given hash on successful assertion.
|
68
|
+
def assert_required_keys(hash, keys, msg = nil)
|
69
|
+
msg ||= DEFAULT_REQUIRED_KEYS_MSG % [keys.join(', ')]
|
70
|
+
all_present = keys.all? { |key| hash.keys.include? key }
|
71
|
+
raise KeyError.new(msg) unless all_present
|
72
|
+
hash
|
73
|
+
end
|
74
|
+
|
61
75
|
private
|
62
76
|
|
63
77
|
# obj must respond_to? all methods or an exception is raised.
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require_relative '../assertable'
|
2
|
+
|
3
|
+
module Wgit
|
4
|
+
extend Assertable
|
5
|
+
|
6
|
+
# The connection details for the database. This must be set if you want to
|
7
|
+
# store and access webpages in a database. Don't set the constant directly,
|
8
|
+
# instead use the funcs contained within the Wgit module.
|
9
|
+
CONNECTION_DETAILS = {}
|
10
|
+
|
11
|
+
# The keys required for a successful database connection.
|
12
|
+
CONNECTION_KEYS_REQUIRED = [
|
13
|
+
'DB_HOST', 'DB_PORT', 'DB_USERNAME', 'DB_PASSWORD', 'DB_DATABASE'
|
14
|
+
]
|
15
|
+
|
16
|
+
# Set the database's connection details from the given hash. It is your
|
17
|
+
# responsibility to ensure the correct hash vars are present and set.
|
18
|
+
#
|
19
|
+
# @param hash [Hash] Containing the database connection details to use.
|
20
|
+
# The hash should contain the following keys (of type String):
|
21
|
+
# DB_HOST, DB_PORT, DB_USERNAME, DB_PASSWORD, DB_DATABASE
|
22
|
+
# @raise [KeyError] If any of the required connection details are missing.
|
23
|
+
# @return [Hash] Containing the database connection details from hash.
|
24
|
+
def self.set_connection_details(hash)
|
25
|
+
assert_required_keys(hash, CONNECTION_KEYS_REQUIRED)
|
26
|
+
|
27
|
+
CONNECTION_DETAILS[:host] = hash.fetch('DB_HOST')
|
28
|
+
CONNECTION_DETAILS[:port] = hash.fetch('DB_PORT')
|
29
|
+
CONNECTION_DETAILS[:uname] = hash.fetch('DB_USERNAME')
|
30
|
+
CONNECTION_DETAILS[:pword] = hash.fetch('DB_PASSWORD')
|
31
|
+
CONNECTION_DETAILS[:db] = hash.fetch('DB_DATABASE')
|
32
|
+
|
33
|
+
CONNECTION_DETAILS
|
34
|
+
end
|
35
|
+
|
36
|
+
# Set the database's connection details from the ENV. It is your
|
37
|
+
# responsibility to ensure the correct ENV vars are present and set.
|
38
|
+
#
|
39
|
+
# The ENV should contain the following keys (of type String):
|
40
|
+
# DB_HOST, DB_PORT, DB_USERNAME, DB_PASSWORD, DB_DATABASE
|
41
|
+
#
|
42
|
+
# @raise [KeyError] If any of the required connection details are missing.
|
43
|
+
# @return [Hash] Containing the database connection details from the ENV.
|
44
|
+
def self.set_connection_details_from_env
|
45
|
+
self.set_connection_details(ENV)
|
46
|
+
end
|
47
|
+
end
|
@@ -3,6 +3,7 @@ require_relative '../url'
|
|
3
3
|
require_relative '../utils'
|
4
4
|
require_relative '../assertable'
|
5
5
|
require_relative 'model'
|
6
|
+
require 'logger'
|
6
7
|
require 'mongo'
|
7
8
|
|
8
9
|
module Wgit
|
@@ -21,17 +22,19 @@ module Wgit
|
|
21
22
|
raise "Wgit::CONNECTION_DETAILS must be defined and include :host,
|
22
23
|
:port, :db, :uname, :pword for a database connection to be established."
|
23
24
|
end
|
24
|
-
|
25
|
-
# Only log
|
26
|
-
Mongo::Logger.logger
|
27
|
-
|
25
|
+
|
26
|
+
# Only log for error (or more severe) scenarios.
|
27
|
+
Mongo::Logger.logger = Wgit.logger.clone
|
28
|
+
Mongo::Logger.logger.progname = 'mongo'
|
29
|
+
Mongo::Logger.logger.level = Logger::ERROR
|
30
|
+
|
28
31
|
address = "#{conn_details[:host]}:#{conn_details[:port]}"
|
29
32
|
@@client = Mongo::Client.new([address],
|
30
|
-
database:
|
31
|
-
user:
|
32
|
-
password:
|
33
|
+
database: conn_details[:db],
|
34
|
+
user: conn_details[:uname],
|
35
|
+
password: conn_details[:pword])
|
33
36
|
end
|
34
|
-
|
37
|
+
|
35
38
|
### Create Data ###
|
36
39
|
|
37
40
|
# Insert one or more Url or Document objects into the DB.
|
data/lib/wgit/document.rb
CHANGED
@@ -24,7 +24,7 @@ module Wgit
|
|
24
24
|
:main, :ol, :p, :pre, :span, :ul, :h1, :h2, :h3, :h4, :h5
|
25
25
|
]
|
26
26
|
|
27
|
-
# The URL of the webpage, an instance of Wgit
|
27
|
+
# The URL of the webpage, an instance of Wgit::Url.
|
28
28
|
attr_reader :url
|
29
29
|
|
30
30
|
# The HTML of the webpage, an instance of String.
|
@@ -56,7 +56,7 @@ module Wgit
|
|
56
56
|
# Init from URL String and HTML String.
|
57
57
|
if url_or_obj.is_a?(String)
|
58
58
|
url = url_or_obj
|
59
|
-
assert_type(url, Url)
|
59
|
+
assert_type(url, Wgit::Url)
|
60
60
|
|
61
61
|
@url = url
|
62
62
|
@html = html ||= ""
|
data/lib/wgit/indexer.rb
CHANGED
@@ -8,8 +8,8 @@ module Wgit
|
|
8
8
|
#
|
9
9
|
# Retrieves uncrawled url's from the database and recursively crawls each
|
10
10
|
# site storing their internal pages into the database and adding their
|
11
|
-
# external url's to be crawled
|
12
|
-
#
|
11
|
+
# external url's to be crawled later on. Logs info on the crawl
|
12
|
+
# using Wgit.logger as it goes along.
|
13
13
|
#
|
14
14
|
# @param max_sites_to_crawl [Integer] The number of separate and whole
|
15
15
|
# websites to be crawled before the method exits. Defaults to -1 which
|
@@ -81,8 +81,8 @@ module Wgit
|
|
81
81
|
|
82
82
|
# Retrieves uncrawled url's from the database and recursively crawls each
|
83
83
|
# site storing their internal pages into the database and adding their
|
84
|
-
# external url's to be crawled
|
85
|
-
#
|
84
|
+
# external url's to be crawled later on. Logs info on the crawl
|
85
|
+
# using Wgit.logger as it goes along.
|
86
86
|
#
|
87
87
|
# @param max_sites_to_crawl [Integer] The number of separate and whole
|
88
88
|
# websites to be crawled before the method exits. Defaults to -1 which
|
@@ -93,28 +93,28 @@ module Wgit
|
|
93
93
|
# that will be obtained.
|
94
94
|
def index_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
|
95
95
|
if max_sites_to_crawl < 0
|
96
|
-
|
97
|
-
urls to crawl (which might be never)."
|
96
|
+
Wgit.logger.info("Indexing until the database has been filled or it runs out of \
|
97
|
+
urls to crawl (which might be never).")
|
98
98
|
end
|
99
99
|
site_count = 0
|
100
100
|
|
101
101
|
while keep_crawling?(site_count, max_sites_to_crawl, max_data_size) do
|
102
|
-
|
102
|
+
Wgit.logger.info("Current database size: #{@db.size}")
|
103
103
|
@crawler.urls = @db.uncrawled_urls
|
104
104
|
|
105
105
|
if @crawler.urls.empty?
|
106
|
-
|
106
|
+
Wgit.logger.info("No urls to crawl, exiting.")
|
107
107
|
return
|
108
108
|
end
|
109
|
-
|
109
|
+
Wgit.logger.info("Starting crawl loop for: #{@crawler.urls}")
|
110
110
|
|
111
111
|
docs_count = 0
|
112
112
|
urls_count = 0
|
113
113
|
|
114
114
|
@crawler.urls.each do |url|
|
115
115
|
unless keep_crawling?(site_count, max_sites_to_crawl, max_data_size)
|
116
|
-
|
117
|
-
capacity, exiting."
|
116
|
+
Wgit.logger.info("Reached max number of sites to crawl or database \
|
117
|
+
capacity, exiting.")
|
118
118
|
return
|
119
119
|
end
|
120
120
|
site_count += 1
|
@@ -133,20 +133,20 @@ capacity, exiting."
|
|
133
133
|
end
|
134
134
|
|
135
135
|
urls_count += write_urls_to_db(ext_links)
|
136
|
-
|
137
|
-
site: #{url}"
|
136
|
+
Wgit.logger.info("Crawled and saved #{site_docs_count} docs for the \
|
137
|
+
site: #{url}")
|
138
138
|
end
|
139
139
|
|
140
|
-
|
141
|
-
this iteration."
|
142
|
-
|
143
|
-
iteration."
|
140
|
+
Wgit.logger.info("Crawled and saved docs for #{docs_count} url(s) overall for \
|
141
|
+
this iteration.")
|
142
|
+
Wgit.logger.info("Found and saved #{urls_count} external url(s) for the next \
|
143
|
+
iteration.")
|
144
144
|
end
|
145
145
|
end
|
146
146
|
|
147
147
|
# Crawls a single website's pages and stores them into the database.
|
148
148
|
# There is no max download limit so be careful which sites you index.
|
149
|
-
#
|
149
|
+
# Logs info on the crawl using Wgit.logger as it goes along.
|
150
150
|
#
|
151
151
|
# @param url [Wgit::Url] The base Url of the website to crawl.
|
152
152
|
# @param insert_externals [Boolean] Whether or not to insert the website's
|
@@ -168,7 +168,7 @@ iteration."
|
|
168
168
|
if result
|
169
169
|
if write_doc_to_db(doc)
|
170
170
|
total_pages_indexed += 1
|
171
|
-
|
171
|
+
Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
|
172
172
|
end
|
173
173
|
end
|
174
174
|
end
|
@@ -182,11 +182,11 @@ iteration."
|
|
182
182
|
|
183
183
|
if insert_externals
|
184
184
|
write_urls_to_db(ext_urls)
|
185
|
-
|
185
|
+
Wgit.logger.info("Found and saved #{ext_urls.length} external url(s)")
|
186
186
|
end
|
187
187
|
|
188
|
-
|
189
|
-
site: #{url}"
|
188
|
+
Wgit.logger.info("Crawled and saved #{total_pages_indexed} docs for the \
|
189
|
+
site: #{url}")
|
190
190
|
|
191
191
|
total_pages_indexed
|
192
192
|
end
|
@@ -208,10 +208,10 @@ site: #{url}"
|
|
208
208
|
# inserts.
|
209
209
|
def write_doc_to_db(doc)
|
210
210
|
@db.insert(doc)
|
211
|
-
|
211
|
+
Wgit.logger.info("Saved document for url: #{doc.url}")
|
212
212
|
true
|
213
213
|
rescue Mongo::Error::OperationFailure
|
214
|
-
|
214
|
+
Wgit.logger.info("Document already exists: #{doc.url}")
|
215
215
|
false
|
216
216
|
end
|
217
217
|
|
@@ -223,9 +223,9 @@ site: #{url}"
|
|
223
223
|
begin
|
224
224
|
@db.insert(url)
|
225
225
|
count += 1
|
226
|
-
|
226
|
+
Wgit.logger.info("Inserted url: #{url}")
|
227
227
|
rescue Mongo::Error::OperationFailure
|
228
|
-
|
228
|
+
Wgit.logger.info("Url already exists: #{url}")
|
229
229
|
end
|
230
230
|
end
|
231
231
|
end
|
data/lib/wgit/logger.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# FYI: The default logger is set at the bottom of this file.
|
2
|
+
|
3
|
+
require 'logger'
|
4
|
+
|
5
|
+
module Wgit
|
6
|
+
# The Logger instance used by Wgit. Set your own custom logger after
|
7
|
+
# requiring this file if needed.
|
8
|
+
@logger = nil
|
9
|
+
|
10
|
+
# Returns the current Logger instance.
|
11
|
+
# @return [Logger] The current Logger instance.
|
12
|
+
def self.logger
|
13
|
+
@logger
|
14
|
+
end
|
15
|
+
|
16
|
+
# Sets the current Logger instance.
|
17
|
+
# @param logger [Logger] The Logger instance to use.
|
18
|
+
# @return [Logger] The current Logger instance having been set.
|
19
|
+
def self.logger=(logger)
|
20
|
+
@logger = logger
|
21
|
+
end
|
22
|
+
|
23
|
+
# Returns the default Logger instance.
|
24
|
+
# @return [Logger] The default Logger instance.
|
25
|
+
def self.default_logger
|
26
|
+
Logger.new(STDOUT, progname: 'wgit', level: :info)
|
27
|
+
end
|
28
|
+
|
29
|
+
# Sets the default Logger instance to be used by Wgit.
|
30
|
+
# @return [Logger] The default Logger instance.
|
31
|
+
def self.use_default_logger
|
32
|
+
@logger = self.default_logger
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
Wgit.use_default_logger
|
data/lib/wgit/version.rb
CHANGED
data/lib/wgit.rb
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
require_relative 'wgit/version'
|
2
|
-
require_relative 'wgit/
|
3
|
-
require_relative 'wgit/
|
2
|
+
require_relative 'wgit/logger'
|
3
|
+
require_relative 'wgit/assertable'
|
4
|
+
require_relative 'wgit/utils'
|
4
5
|
require_relative 'wgit/url'
|
5
6
|
require_relative 'wgit/document'
|
6
|
-
require_relative 'wgit/
|
7
|
-
require_relative 'wgit/
|
8
|
-
require_relative 'wgit/database/database'
|
7
|
+
require_relative 'wgit/crawler'
|
8
|
+
require_relative 'wgit/database/connection_details'
|
9
9
|
require_relative 'wgit/database/model'
|
10
|
-
require_relative 'wgit/database/
|
11
|
-
|
10
|
+
require_relative 'wgit/database/database'
|
11
|
+
require_relative 'wgit/indexer'
|
12
|
+
#require_relative 'wgit/core_ext' - Must be explicitly required.
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wgit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
@@ -94,6 +94,20 @@ dependencies:
|
|
94
94
|
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '12.3'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: httplog
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '1.3'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '1.3'
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: nokogiri
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
@@ -138,11 +152,12 @@ files:
|
|
138
152
|
- "./lib/wgit/assertable.rb"
|
139
153
|
- "./lib/wgit/core_ext.rb"
|
140
154
|
- "./lib/wgit/crawler.rb"
|
155
|
+
- "./lib/wgit/database/connection_details.rb"
|
141
156
|
- "./lib/wgit/database/database.rb"
|
142
157
|
- "./lib/wgit/database/model.rb"
|
143
|
-
- "./lib/wgit/database/mongo_connection_details.rb"
|
144
158
|
- "./lib/wgit/document.rb"
|
145
159
|
- "./lib/wgit/indexer.rb"
|
160
|
+
- "./lib/wgit/logger.rb"
|
146
161
|
- "./lib/wgit/url.rb"
|
147
162
|
- "./lib/wgit/utils.rb"
|
148
163
|
- "./lib/wgit/version.rb"
|
@@ -1,48 +0,0 @@
|
|
1
|
-
module Wgit
|
2
|
-
# The connection details for the database. This must be set if you want to
|
3
|
-
# store and access webpages in a database. Don't set the constant directly,
|
4
|
-
# instead use the funcs contained within the Wgit module.
|
5
|
-
CONNECTION_DETAILS = {}
|
6
|
-
|
7
|
-
# Set the database's connection details from the given hash and freeze them.
|
8
|
-
# It is your responsibility to ensure the correct hash vars are present and
|
9
|
-
# set. Due to the freezing of the CONNECTION_DETAILS, this func is designed
|
10
|
-
# to be called only once.
|
11
|
-
#
|
12
|
-
# @param hash [Hash] Containing the database connection details to use.
|
13
|
-
# The hash should contain the following keys (of type String):
|
14
|
-
# host, port, uname, pword, db
|
15
|
-
# @raise [KeyError, FrozenError] If any of the required connection
|
16
|
-
# details are missing or if the connection details have already been set.
|
17
|
-
# @return [Hash] Containing the database connection details from hash.
|
18
|
-
def self.set_connection_details(hash)
|
19
|
-
CONNECTION_DETAILS[:host] = hash.fetch('host')
|
20
|
-
CONNECTION_DETAILS[:port] = hash.fetch('port')
|
21
|
-
CONNECTION_DETAILS[:uname] = hash.fetch('uname')
|
22
|
-
CONNECTION_DETAILS[:pword] = hash.fetch('pword')
|
23
|
-
CONNECTION_DETAILS[:db] = hash.fetch('db')
|
24
|
-
|
25
|
-
CONNECTION_DETAILS.freeze
|
26
|
-
end
|
27
|
-
|
28
|
-
# Set the database's connection details from the ENV and freeze them. It is
|
29
|
-
# your responsibility to ensure the correct ENV vars are present and set.
|
30
|
-
# Due to the freezing of the CONNECTION_DETAILS, this func is designed to be
|
31
|
-
# called only once.
|
32
|
-
#
|
33
|
-
# The ENV should contain the following keys (of type String):
|
34
|
-
# DB_HOST, DB_PORT, DB_USERNAME, DB_PASSWORD, DB_DATABASE
|
35
|
-
#
|
36
|
-
# @raise [KeyError, FrozenError] If any of the required connection
|
37
|
-
# details are missing or if the connection details have already been set.
|
38
|
-
# @return [Hash] Containing the database connection details from the ENV.
|
39
|
-
def self.set_connection_details_from_env
|
40
|
-
CONNECTION_DETAILS[:host] = ENV.fetch('DB_HOST')
|
41
|
-
CONNECTION_DETAILS[:port] = ENV.fetch('DB_PORT')
|
42
|
-
CONNECTION_DETAILS[:uname] = ENV.fetch('DB_USERNAME')
|
43
|
-
CONNECTION_DETAILS[:pword] = ENV.fetch('DB_PASSWORD')
|
44
|
-
CONNECTION_DETAILS[:db] = ENV.fetch('DB_DATABASE')
|
45
|
-
|
46
|
-
CONNECTION_DETAILS.freeze
|
47
|
-
end
|
48
|
-
end
|