wgit 0.0.6 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/wgit/assertable.rb +15 -1
- data/lib/wgit/database/connection_details.rb +47 -0
- data/lib/wgit/database/database.rb +11 -8
- data/lib/wgit/document.rb +2 -2
- data/lib/wgit/indexer.rb +26 -26
- data/lib/wgit/logger.rb +36 -0
- data/lib/wgit/version.rb +1 -1
- data/lib/wgit.rb +8 -7
- metadata +17 -2
- data/lib/wgit/database/mongo_connection_details.rb +0 -48
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 81cc82cb5f9b408ca678b7b1731bac5531e72be11186d3c15d36bfbf61ed3838
|
4
|
+
data.tar.gz: b20d7c77389895b7a4dd303cb3a804aa0d1b099a6f98243144f6ae7f04094cd3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c17a87d4bb10f750ea9d19faf16ed0a6de10caeab47d3586a3c2c633ec36a990c238ff9e119763c46868a81c111a4b19abd1cad8f09af7008c89474ff0bbf861
|
7
|
+
data.tar.gz: 3ad0950b664dc872bbc9394e6307974ace5a4ac35aac446aada4d470703360faac28efd911e48895708274581a82cb6c1b2ff1f02116122a2e25f817cd54b5c2
|
data/lib/wgit/assertable.rb
CHANGED
@@ -6,6 +6,7 @@ module Wgit
|
|
6
6
|
DEFAULT_TYPE_FAIL_MSG = "Expected: %s, Actual: %s".freeze
|
7
7
|
WRONG_METHOD_MSG = "arr must be Enumerable, use a different method".freeze
|
8
8
|
DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s".freeze
|
9
|
+
DEFAULT_REQUIRED_KEYS_MSG = "Some or all of the required keys are not present: %s".freeze
|
9
10
|
|
10
11
|
# Tests if the obj is of a given type.
|
11
12
|
#
|
@@ -57,7 +58,20 @@ module Wgit
|
|
57
58
|
end
|
58
59
|
obj_or_objs
|
59
60
|
end
|
60
|
-
|
61
|
+
|
62
|
+
# Asserts that the hash includes all of the given keys; raises a KeyError otherwise.
|
63
|
+
#
|
64
|
+
# @param hash [Hash] The hash which should include the required keys.
|
65
|
+
# @param keys [Array<String, Symbol>] The keys whose presence to assert.
|
66
|
+
# @param msg [String] The raised KeyError message, if provided.
|
67
|
+
# @return [Hash] The given hash on successful assertion.
|
68
|
+
def assert_required_keys(hash, keys, msg = nil)
|
69
|
+
msg ||= DEFAULT_REQUIRED_KEYS_MSG % [keys.join(', ')]
|
70
|
+
all_present = keys.all? { |key| hash.keys.include? key }
|
71
|
+
raise KeyError.new(msg) unless all_present
|
72
|
+
hash
|
73
|
+
end
|
74
|
+
|
61
75
|
private
|
62
76
|
|
63
77
|
# obj must respond_to? all methods or an exception is raised.
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require_relative '../assertable'
|
2
|
+
|
3
|
+
module Wgit
|
4
|
+
extend Assertable
|
5
|
+
|
6
|
+
# The connection details for the database. This must be set if you want to
|
7
|
+
# store and access webpages in a database. Don't set the constant directly,
|
8
|
+
# instead use the funcs contained within the Wgit module.
|
9
|
+
CONNECTION_DETAILS = {}
|
10
|
+
|
11
|
+
# The keys required for a successful database connection.
|
12
|
+
CONNECTION_KEYS_REQUIRED = [
|
13
|
+
'DB_HOST', 'DB_PORT', 'DB_USERNAME', 'DB_PASSWORD', 'DB_DATABASE'
|
14
|
+
]
|
15
|
+
|
16
|
+
# Set the database's connection details from the given hash. It is your
|
17
|
+
# responsibility to ensure the correct hash vars are present and set.
|
18
|
+
#
|
19
|
+
# @param hash [Hash] Containing the database connection details to use.
|
20
|
+
# The hash should contain the following keys (of type String):
|
21
|
+
# DB_HOST, DB_PORT, DB_USERNAME, DB_PASSWORD, DB_DATABASE
|
22
|
+
# @raise [KeyError] If any of the required connection details are missing.
|
23
|
+
# @return [Hash] Containing the database connection details from hash.
|
24
|
+
def self.set_connection_details(hash)
|
25
|
+
assert_required_keys(hash, CONNECTION_KEYS_REQUIRED)
|
26
|
+
|
27
|
+
CONNECTION_DETAILS[:host] = hash.fetch('DB_HOST')
|
28
|
+
CONNECTION_DETAILS[:port] = hash.fetch('DB_PORT')
|
29
|
+
CONNECTION_DETAILS[:uname] = hash.fetch('DB_USERNAME')
|
30
|
+
CONNECTION_DETAILS[:pword] = hash.fetch('DB_PASSWORD')
|
31
|
+
CONNECTION_DETAILS[:db] = hash.fetch('DB_DATABASE')
|
32
|
+
|
33
|
+
CONNECTION_DETAILS
|
34
|
+
end
|
35
|
+
|
36
|
+
# Set the database's connection details from the ENV. It is your
|
37
|
+
# responsibility to ensure the correct ENV vars are present and set.
|
38
|
+
#
|
39
|
+
# The ENV should contain the following keys (of type String):
|
40
|
+
# DB_HOST, DB_PORT, DB_USERNAME, DB_PASSWORD, DB_DATABASE
|
41
|
+
#
|
42
|
+
# @raise [KeyError] If any of the required connection details are missing.
|
43
|
+
# @return [Hash] Containing the database connection details from the ENV.
|
44
|
+
def self.set_connection_details_from_env
|
45
|
+
self.set_connection_details(ENV)
|
46
|
+
end
|
47
|
+
end
|
@@ -3,6 +3,7 @@ require_relative '../url'
|
|
3
3
|
require_relative '../utils'
|
4
4
|
require_relative '../assertable'
|
5
5
|
require_relative 'model'
|
6
|
+
require 'logger'
|
6
7
|
require 'mongo'
|
7
8
|
|
8
9
|
module Wgit
|
@@ -21,17 +22,19 @@ module Wgit
|
|
21
22
|
raise "Wgit::CONNECTION_DETAILS must be defined and include :host,
|
22
23
|
:port, :db, :uname, :pword for a database connection to be established."
|
23
24
|
end
|
24
|
-
|
25
|
-
# Only log
|
26
|
-
Mongo::Logger.logger
|
27
|
-
|
25
|
+
|
26
|
+
# Only log for error (or more severe) scenarios.
|
27
|
+
Mongo::Logger.logger = Wgit.logger.clone
|
28
|
+
Mongo::Logger.logger.progname = 'mongo'
|
29
|
+
Mongo::Logger.logger.level = Logger::ERROR
|
30
|
+
|
28
31
|
address = "#{conn_details[:host]}:#{conn_details[:port]}"
|
29
32
|
@@client = Mongo::Client.new([address],
|
30
|
-
database:
|
31
|
-
user:
|
32
|
-
password:
|
33
|
+
database: conn_details[:db],
|
34
|
+
user: conn_details[:uname],
|
35
|
+
password: conn_details[:pword])
|
33
36
|
end
|
34
|
-
|
37
|
+
|
35
38
|
### Create Data ###
|
36
39
|
|
37
40
|
# Insert one or more Url or Document objects into the DB.
|
data/lib/wgit/document.rb
CHANGED
@@ -24,7 +24,7 @@ module Wgit
|
|
24
24
|
:main, :ol, :p, :pre, :span, :ul, :h1, :h2, :h3, :h4, :h5
|
25
25
|
]
|
26
26
|
|
27
|
-
# The URL of the webpage, an instance of Wgit
|
27
|
+
# The URL of the webpage, an instance of Wgit::Url.
|
28
28
|
attr_reader :url
|
29
29
|
|
30
30
|
# The HTML of the webpage, an instance of String.
|
@@ -56,7 +56,7 @@ module Wgit
|
|
56
56
|
# Init from URL String and HTML String.
|
57
57
|
if url_or_obj.is_a?(String)
|
58
58
|
url = url_or_obj
|
59
|
-
assert_type(url, Url)
|
59
|
+
assert_type(url, Wgit::Url)
|
60
60
|
|
61
61
|
@url = url
|
62
62
|
@html = html ||= ""
|
data/lib/wgit/indexer.rb
CHANGED
@@ -8,8 +8,8 @@ module Wgit
|
|
8
8
|
#
|
9
9
|
# Retrieves uncrawled urls from the database and recursively crawls each
|
10
10
|
# site storing their internal pages into the database and adding their
|
11
|
-
# external url's to be crawled
|
12
|
-
#
|
11
|
+
# external urls to be crawled later on. Logs info on the crawl
|
12
|
+
# using Wgit.logger as it goes along.
|
13
13
|
#
|
14
14
|
# @param max_sites_to_crawl [Integer] The number of separate and whole
|
15
15
|
# websites to be crawled before the method exits. Defaults to -1 which
|
@@ -81,8 +81,8 @@ module Wgit
|
|
81
81
|
|
82
82
|
# Retrieves uncrawled urls from the database and recursively crawls each
|
83
83
|
# site storing their internal pages into the database and adding their
|
84
|
-
# external url's to be crawled
|
85
|
-
#
|
84
|
+
# external urls to be crawled later on. Logs info on the crawl
|
85
|
+
# using Wgit.logger as it goes along.
|
86
86
|
#
|
87
87
|
# @param max_sites_to_crawl [Integer] The number of separate and whole
|
88
88
|
# websites to be crawled before the method exits. Defaults to -1 which
|
@@ -93,28 +93,28 @@ module Wgit
|
|
93
93
|
# that will be obtained.
|
94
94
|
def index_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
|
95
95
|
if max_sites_to_crawl < 0
|
96
|
-
|
97
|
-
urls to crawl (which might be never)."
|
96
|
+
Wgit.logger.info("Indexing until the database has been filled or it runs out of \
|
97
|
+
urls to crawl (which might be never).")
|
98
98
|
end
|
99
99
|
site_count = 0
|
100
100
|
|
101
101
|
while keep_crawling?(site_count, max_sites_to_crawl, max_data_size) do
|
102
|
-
|
102
|
+
Wgit.logger.info("Current database size: #{@db.size}")
|
103
103
|
@crawler.urls = @db.uncrawled_urls
|
104
104
|
|
105
105
|
if @crawler.urls.empty?
|
106
|
-
|
106
|
+
Wgit.logger.info("No urls to crawl, exiting.")
|
107
107
|
return
|
108
108
|
end
|
109
|
-
|
109
|
+
Wgit.logger.info("Starting crawl loop for: #{@crawler.urls}")
|
110
110
|
|
111
111
|
docs_count = 0
|
112
112
|
urls_count = 0
|
113
113
|
|
114
114
|
@crawler.urls.each do |url|
|
115
115
|
unless keep_crawling?(site_count, max_sites_to_crawl, max_data_size)
|
116
|
-
|
117
|
-
capacity, exiting."
|
116
|
+
Wgit.logger.info("Reached max number of sites to crawl or database \
|
117
|
+
capacity, exiting.")
|
118
118
|
return
|
119
119
|
end
|
120
120
|
site_count += 1
|
@@ -133,20 +133,20 @@ capacity, exiting."
|
|
133
133
|
end
|
134
134
|
|
135
135
|
urls_count += write_urls_to_db(ext_links)
|
136
|
-
|
137
|
-
site: #{url}"
|
136
|
+
Wgit.logger.info("Crawled and saved #{site_docs_count} docs for the \
|
137
|
+
site: #{url}")
|
138
138
|
end
|
139
139
|
|
140
|
-
|
141
|
-
this iteration."
|
142
|
-
|
143
|
-
iteration."
|
140
|
+
Wgit.logger.info("Crawled and saved docs for #{docs_count} url(s) overall for \
|
141
|
+
this iteration.")
|
142
|
+
Wgit.logger.info("Found and saved #{urls_count} external url(s) for the next \
|
143
|
+
iteration.")
|
144
144
|
end
|
145
145
|
end
|
146
146
|
|
147
147
|
# Crawls a single website's pages and stores them into the database.
|
148
148
|
# There is no max download limit so be careful which sites you index.
|
149
|
-
#
|
149
|
+
# Logs info on the crawl using Wgit.logger as it goes along.
|
150
150
|
#
|
151
151
|
# @param url [Wgit::Url] The base Url of the website to crawl.
|
152
152
|
# @param insert_externals [Boolean] Whether or not to insert the website's
|
@@ -168,7 +168,7 @@ iteration."
|
|
168
168
|
if result
|
169
169
|
if write_doc_to_db(doc)
|
170
170
|
total_pages_indexed += 1
|
171
|
-
|
171
|
+
Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
|
172
172
|
end
|
173
173
|
end
|
174
174
|
end
|
@@ -182,11 +182,11 @@ iteration."
|
|
182
182
|
|
183
183
|
if insert_externals
|
184
184
|
write_urls_to_db(ext_urls)
|
185
|
-
|
185
|
+
Wgit.logger.info("Found and saved #{ext_urls.length} external url(s)")
|
186
186
|
end
|
187
187
|
|
188
|
-
|
189
|
-
site: #{url}"
|
188
|
+
Wgit.logger.info("Crawled and saved #{total_pages_indexed} docs for the \
|
189
|
+
site: #{url}")
|
190
190
|
|
191
191
|
total_pages_indexed
|
192
192
|
end
|
@@ -208,10 +208,10 @@ site: #{url}"
|
|
208
208
|
# inserts.
|
209
209
|
def write_doc_to_db(doc)
|
210
210
|
@db.insert(doc)
|
211
|
-
|
211
|
+
Wgit.logger.info("Saved document for url: #{doc.url}")
|
212
212
|
true
|
213
213
|
rescue Mongo::Error::OperationFailure
|
214
|
-
|
214
|
+
Wgit.logger.info("Document already exists: #{doc.url}")
|
215
215
|
false
|
216
216
|
end
|
217
217
|
|
@@ -223,9 +223,9 @@ site: #{url}"
|
|
223
223
|
begin
|
224
224
|
@db.insert(url)
|
225
225
|
count += 1
|
226
|
-
|
226
|
+
Wgit.logger.info("Inserted url: #{url}")
|
227
227
|
rescue Mongo::Error::OperationFailure
|
228
|
-
|
228
|
+
Wgit.logger.info("Url already exists: #{url}")
|
229
229
|
end
|
230
230
|
end
|
231
231
|
end
|
data/lib/wgit/logger.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# FYI: The default logger is set at the bottom of this file.
|
2
|
+
|
3
|
+
require 'logger'
|
4
|
+
|
5
|
+
module Wgit
|
6
|
+
# The Logger instance used by Wgit. Set your own custom logger after
|
7
|
+
# requiring this file if needed.
|
8
|
+
@logger = nil
|
9
|
+
|
10
|
+
# Returns the current Logger instance.
|
11
|
+
# @return [Logger] The current Logger instance.
|
12
|
+
def self.logger
|
13
|
+
@logger
|
14
|
+
end
|
15
|
+
|
16
|
+
# Sets the current Logger instance.
|
17
|
+
# @param logger [Logger] The Logger instance to use.
|
18
|
+
# @return [Logger] The current Logger instance, having been set.
|
19
|
+
def self.logger=(logger)
|
20
|
+
@logger = logger
|
21
|
+
end
|
22
|
+
|
23
|
+
# Returns the default Logger instance.
|
24
|
+
# @return [Logger] The default Logger instance.
|
25
|
+
def self.default_logger
|
26
|
+
Logger.new(STDOUT, progname: 'wgit', level: :info)
|
27
|
+
end
|
28
|
+
|
29
|
+
# Sets the default Logger instance to be used by Wgit.
|
30
|
+
# @return [Logger] The default Logger instance.
|
31
|
+
def self.use_default_logger
|
32
|
+
@logger = self.default_logger
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
Wgit.use_default_logger
|
data/lib/wgit/version.rb
CHANGED
data/lib/wgit.rb
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
require_relative 'wgit/version'
|
2
|
-
require_relative 'wgit/
|
3
|
-
require_relative 'wgit/
|
2
|
+
require_relative 'wgit/logger'
|
3
|
+
require_relative 'wgit/assertable'
|
4
|
+
require_relative 'wgit/utils'
|
4
5
|
require_relative 'wgit/url'
|
5
6
|
require_relative 'wgit/document'
|
6
|
-
require_relative 'wgit/
|
7
|
-
require_relative 'wgit/
|
8
|
-
require_relative 'wgit/database/database'
|
7
|
+
require_relative 'wgit/crawler'
|
8
|
+
require_relative 'wgit/database/connection_details'
|
9
9
|
require_relative 'wgit/database/model'
|
10
|
-
require_relative 'wgit/database/
|
11
|
-
|
10
|
+
require_relative 'wgit/database/database'
|
11
|
+
require_relative 'wgit/indexer'
|
12
|
+
#require_relative 'wgit/core_ext' - Must be explicitly required.
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wgit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
@@ -94,6 +94,20 @@ dependencies:
|
|
94
94
|
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '12.3'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: httplog
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '1.3'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '1.3'
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: nokogiri
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
@@ -138,11 +152,12 @@ files:
|
|
138
152
|
- "./lib/wgit/assertable.rb"
|
139
153
|
- "./lib/wgit/core_ext.rb"
|
140
154
|
- "./lib/wgit/crawler.rb"
|
155
|
+
- "./lib/wgit/database/connection_details.rb"
|
141
156
|
- "./lib/wgit/database/database.rb"
|
142
157
|
- "./lib/wgit/database/model.rb"
|
143
|
-
- "./lib/wgit/database/mongo_connection_details.rb"
|
144
158
|
- "./lib/wgit/document.rb"
|
145
159
|
- "./lib/wgit/indexer.rb"
|
160
|
+
- "./lib/wgit/logger.rb"
|
146
161
|
- "./lib/wgit/url.rb"
|
147
162
|
- "./lib/wgit/utils.rb"
|
148
163
|
- "./lib/wgit/version.rb"
|
@@ -1,48 +0,0 @@
|
|
1
|
-
module Wgit
|
2
|
-
# The connection details for the database. This must be set if you want to
|
3
|
-
# store and access webpages in a database. Don't set the constant directly,
|
4
|
-
# instead use the funcs contained within the Wgit module.
|
5
|
-
CONNECTION_DETAILS = {}
|
6
|
-
|
7
|
-
# Set the database's connection details from the given hash and freeze them.
|
8
|
-
# It is your responsibility to ensure the correct hash vars are present and
|
9
|
-
# set. Due to the freezing of the CONNECTION_DETAILS, this func is designed
|
10
|
-
# to be called only once.
|
11
|
-
#
|
12
|
-
# @param hash [Hash] Containing the database connection details to use.
|
13
|
-
# The hash should contain the following keys (of type String):
|
14
|
-
# host, port, uname, pword, db
|
15
|
-
# @raise [KeyError, FrozenError] If any of the required connection
|
16
|
-
# details are missing or if the connection details have already been set.
|
17
|
-
# @return [Hash] Containing the database connection details from hash.
|
18
|
-
def self.set_connection_details(hash)
|
19
|
-
CONNECTION_DETAILS[:host] = hash.fetch('host')
|
20
|
-
CONNECTION_DETAILS[:port] = hash.fetch('port')
|
21
|
-
CONNECTION_DETAILS[:uname] = hash.fetch('uname')
|
22
|
-
CONNECTION_DETAILS[:pword] = hash.fetch('pword')
|
23
|
-
CONNECTION_DETAILS[:db] = hash.fetch('db')
|
24
|
-
|
25
|
-
CONNECTION_DETAILS.freeze
|
26
|
-
end
|
27
|
-
|
28
|
-
# Set the database's connection details from the ENV and freeze them. It is
|
29
|
-
# your responsibility to ensure the correct ENV vars are present and set.
|
30
|
-
# Due to the freezing of the CONNECTION_DETAILS, this func is designed to be
|
31
|
-
# called only once.
|
32
|
-
#
|
33
|
-
# The ENV should contain the following keys (of type String):
|
34
|
-
# DB_HOST, DB_PORT, DB_USERNAME, DB_PASSWORD, DB_DATABASE
|
35
|
-
#
|
36
|
-
# @raise [KeyError, FrozenError] If any of the required connection
|
37
|
-
# details are missing or if the connection details have already been set.
|
38
|
-
# @return [Hash] Containing the database connection details from the ENV.
|
39
|
-
def self.set_connection_details_from_env
|
40
|
-
CONNECTION_DETAILS[:host] = ENV.fetch('DB_HOST')
|
41
|
-
CONNECTION_DETAILS[:port] = ENV.fetch('DB_PORT')
|
42
|
-
CONNECTION_DETAILS[:uname] = ENV.fetch('DB_USERNAME')
|
43
|
-
CONNECTION_DETAILS[:pword] = ENV.fetch('DB_PASSWORD')
|
44
|
-
CONNECTION_DETAILS[:db] = ENV.fetch('DB_DATABASE')
|
45
|
-
|
46
|
-
CONNECTION_DETAILS.freeze
|
47
|
-
end
|
48
|
-
end
|