wgit 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/lib/wgit.rb +1 -1
- data/lib/wgit/assertable.rb +72 -61
- data/lib/wgit/core_ext.rb +11 -5
- data/lib/wgit/crawler.rb +97 -57
- data/lib/wgit/database/database.rb +247 -170
- data/lib/wgit/database/model.rb +40 -24
- data/lib/wgit/database/mongo_connection_details.rb +44 -23
- data/lib/wgit/document.rb +534 -233
- data/lib/wgit/indexer.rb +235 -0
- data/lib/wgit/url.rb +199 -121
- data/lib/wgit/utils.rb +143 -96
- data/lib/wgit/version.rb +5 -1
- metadata +10 -9
- data/lib/wgit/web_crawler.rb +0 -134
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: c2ee83f5b722a6aff6fad7727751e149b58de02de3abecd49e51066a1ecf035d
|
4
|
+
data.tar.gz: 112fb3b45192e781d8a4b4eeaca6d3ad7abaaa1a29b8833654b17a9b38644f81
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a29f32db3538bed0e2b09ae599623ee9751ed0f929f83b1d7a987982132431cc72b89fa5bc3e2522c50c50c4a7d0fd33aaf0fbc000986b1db8f0494a37cebd7c
|
7
|
+
data.tar.gz: faf0f814dad58fef4a4ec61a5cdd3f4b3ba7170b417e48a18f4a45922efa0dcab8ffaf4bbe0d198d53e5778056599e6827d1315ce68e08984ca2af47e26631bc
|
data/lib/wgit.rb
CHANGED
data/lib/wgit/assertable.rb
CHANGED
@@ -1,69 +1,80 @@
|
|
1
|
-
|
2
1
|
module Wgit
|
3
2
|
|
4
|
-
#
|
5
|
-
#
|
6
|
-
# for asserting the integrity of method definitions etc.
|
3
|
+
# Module containing assert methods including type checking which can be used
|
4
|
+
# for asserting the integrity of method definitions etc.
|
7
5
|
module Assertable
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
end
|
26
|
-
|
27
|
-
# Each object within arr must match one of the types listed in
|
28
|
-
# type_or_types or an exception is thrown using msg if provided.
|
29
|
-
# type_or_types can be a single Class or an Enumerable of Class objects,
|
30
|
-
# Strings and Symbols will not work.
|
31
|
-
def assert_arr_types(arr, type_or_types, msg = nil)
|
32
|
-
raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
|
33
|
-
arr.each do |obj|
|
34
|
-
assert_types(obj, type_or_types, msg)
|
35
|
-
end
|
6
|
+
DEFAULT_TYPE_FAIL_MSG = "Expected: %s, Actual: %s".freeze
|
7
|
+
WRONG_METHOD_MSG = "arr must be Enumerable, use a different method".freeze
|
8
|
+
DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s".freeze
|
9
|
+
|
10
|
+
# Tests if the obj is of a given type.
|
11
|
+
#
|
12
|
+
# @param obj [Object] The Object to test.
|
13
|
+
# @param type_or_types [Type, Array<Type>] The type/types that obj must
|
14
|
+
# belong to or an exception is thrown.
|
15
|
+
# @param msg [String] The raised RuntimeError message, if provided.
|
16
|
+
# @return [Object] The given obj on successful assertion.
|
17
|
+
def assert_types(obj, type_or_types, msg = nil)
|
18
|
+
msg ||= DEFAULT_TYPE_FAIL_MSG % [type_or_types, obj.class]
|
19
|
+
if type_or_types.respond_to?(:any?)
|
20
|
+
match = type_or_types.any? { |type| obj.instance_of?(type) }
|
21
|
+
else
|
22
|
+
match = obj.instance_of?(type_or_types)
|
36
23
|
end
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
24
|
+
raise msg unless match
|
25
|
+
obj
|
26
|
+
end
|
27
|
+
|
28
|
+
# Each object within arr must match one of the types listed in
|
29
|
+
# type_or_types or an exception is raised using msg, if provided.
|
30
|
+
#
|
31
|
+
# @param arr [Enumerable#each] Enumerable of objects to type check.
|
32
|
+
# @param type_or_types [Type, Array<Type>] The allowed type(s).
|
33
|
+
# @param msg [String] The raised RuntimeError message, if provided.
|
34
|
+
# @return [Object] The given arr on successful assertion.
|
35
|
+
def assert_arr_types(arr, type_or_types, msg = nil)
|
36
|
+
raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
|
37
|
+
arr.each do |obj|
|
38
|
+
assert_types(obj, type_or_types, msg)
|
50
39
|
end
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
40
|
+
end
|
41
|
+
|
42
|
+
# The obj_or_objs must respond_to? all of the given methods or an
|
43
|
+
# Exception is raised using msg, if provided.
|
44
|
+
#
|
45
|
+
# @param obj_or_objs [Object, Enumerable#each] The objects to duck check.
|
46
|
+
# @param methods [Array<Symbol>] The methods to :respond_to?.
|
47
|
+
# @param msg [String] The raised RuntimeError message, if provided.
|
48
|
+
# @return [Object] The given obj_or_objs on successful assertion.
|
49
|
+
def assert_respond_to(obj_or_objs, methods, msg = nil)
|
50
|
+
methods = [methods] unless methods.respond_to?(:all?)
|
51
|
+
if obj_or_objs.respond_to?(:each)
|
52
|
+
obj_or_objs.each do |obj|
|
53
|
+
_assert_respond_to(obj, methods, msg)
|
54
|
+
end
|
55
|
+
else
|
56
|
+
_assert_respond_to(obj_or_objs, methods, msg)
|
59
57
|
end
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
58
|
+
obj_or_objs
|
59
|
+
end
|
60
|
+
|
61
|
+
private
|
62
|
+
|
63
|
+
# obj must respond_to? all methods or an exception is raised.
|
64
|
+
def _assert_respond_to(obj, methods, msg = nil)
|
65
|
+
raise "methods must respond_to? :all?" unless methods.respond_to?(:all?)
|
66
|
+
msg ||= DEFAULT_DUCK_FAIL_MSG % ["#{obj.class} (#{obj})", methods]
|
67
|
+
match = methods.all? { |method| obj.respond_to?(method) }
|
68
|
+
raise msg unless match
|
69
|
+
obj
|
70
|
+
end
|
71
|
+
|
72
|
+
alias :assert_type :assert_types
|
73
|
+
alias :type :assert_types
|
74
|
+
alias :types :assert_types
|
75
|
+
alias :assert_arr_type :assert_arr_types
|
76
|
+
alias :arr_type :assert_arr_types
|
77
|
+
alias :arr_types :assert_arr_types
|
78
|
+
alias :respond_to :assert_respond_to
|
68
79
|
end
|
69
80
|
end
|
data/lib/wgit/core_ext.rb
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
require_relative 'url'
|
2
2
|
|
3
|
-
# @author Michael Telford
|
4
3
|
# Script which extends Ruby's core functionality when parsed.
|
5
|
-
# Needs to be required separately using `require 'wgit/core_ext'`.
|
4
|
+
# Needs to be required separately using `require 'wgit/core_ext'`.
|
6
5
|
|
7
6
|
class String
|
8
|
-
# Converts a String into a Wgit::Url object.
|
7
|
+
# Converts a String into a Wgit::Url object.
|
8
|
+
#
|
9
|
+
# @return [Wgit::Url] The converted URL.
|
9
10
|
def to_url
|
10
11
|
Wgit::Url.new(self)
|
11
12
|
end
|
@@ -13,7 +14,9 @@ end
|
|
13
14
|
|
14
15
|
module Enumerable
|
15
16
|
# Converts each String instance into a Wgit::Url object and returns the new
|
16
|
-
#
|
17
|
+
# Array.
|
18
|
+
#
|
19
|
+
# @return [Array<Wgit::Url>] The converted URL's.
|
17
20
|
def to_urls
|
18
21
|
map do |element|
|
19
22
|
process_url_element(element)
|
@@ -21,7 +24,9 @@ module Enumerable
|
|
21
24
|
end
|
22
25
|
|
23
26
|
# Converts each String instance into a Wgit::Url object and returns the
|
24
|
-
# updated array.
|
27
|
+
# updated array. Modifies the receiver.
|
28
|
+
#
|
29
|
+
# @return [Array<Wgit::Url>] Self containing the converted URL's.
|
25
30
|
def to_urls!
|
26
31
|
map! do |element|
|
27
32
|
process_url_element(element)
|
@@ -31,6 +36,7 @@ end
|
|
31
36
|
|
32
37
|
private
|
33
38
|
|
39
|
+
# Converts the element to a Wgit::Url if the element is a String.
|
34
40
|
def process_url_element(element)
|
35
41
|
if element.is_a? String
|
36
42
|
element.to_url
|
data/lib/wgit/crawler.rb
CHANGED
@@ -3,67 +3,106 @@ require_relative 'document'
|
|
3
3
|
require_relative 'utils'
|
4
4
|
require_relative 'assertable'
|
5
5
|
require 'net/http' # requires 'uri'
|
6
|
-
|
6
|
+
|
7
7
|
module Wgit
|
8
8
|
|
9
|
-
#
|
10
|
-
#
|
11
|
-
# Note that
|
12
|
-
# functionality.
|
9
|
+
# The Crawler class provides a means of crawling web based URL's, turning
|
10
|
+
# their HTML into Wgit::Document's.
|
11
|
+
# Note that currently all redirects will not be followed during a crawl.
|
13
12
|
class Crawler
|
14
13
|
include Assertable
|
15
14
|
|
16
|
-
|
15
|
+
# The urls to crawl.
|
16
|
+
attr_reader :urls
|
17
|
+
|
18
|
+
# The docs of the crawled @urls.
|
19
|
+
attr_reader :docs
|
17
20
|
|
18
|
-
|
19
|
-
|
21
|
+
# Initializes the Crawler by setting the @urls and @docs.
|
22
|
+
#
|
23
|
+
# @param urls [*Wgit::Url] The URLs to crawl.
|
24
|
+
def initialize(*urls)
|
25
|
+
self.[](*urls)
|
20
26
|
@docs = []
|
21
|
-
|
22
|
-
|
27
|
+
end
|
28
|
+
|
29
|
+
# Sets this Crawler's @urls.
|
30
|
+
#
|
31
|
+
# @param urls [Array<Wgit::Url>] The URLs to crawl.
|
23
32
|
def urls=(urls)
|
24
|
-
|
25
|
-
|
33
|
+
@urls = []
|
34
|
+
Wgit::Utils.each(urls) { |url| add_url(url) }
|
26
35
|
end
|
27
36
|
|
37
|
+
# Sets this Crawler's @urls.
|
38
|
+
#
|
39
|
+
# @param urls [*Wgit::Url] The URLs to crawl.
|
28
40
|
def [](*urls)
|
29
|
-
|
41
|
+
# If urls is nil then add_url (when called later) will set @urls = []
|
42
|
+
# so we do nothing here.
|
43
|
+
if not urls.nil?
|
44
|
+
# Due to *urls you can end up with [[url1,url2,url3]] etc. where the
|
45
|
+
# outer array is bogus so we use the inner one only.
|
46
|
+
if urls.is_a?(Enumerable) &&
|
47
|
+
urls.length == 1 &&
|
48
|
+
urls.first.is_a?(Enumerable)
|
49
|
+
urls = urls.first
|
50
|
+
end
|
51
|
+
|
52
|
+
# Here we call urls= method using self because the param name is also
|
53
|
+
# urls which conflicts.
|
54
|
+
self.urls = urls
|
55
|
+
end
|
30
56
|
end
|
31
57
|
|
58
|
+
# Adds the url to this Crawler's @urls.
|
59
|
+
#
|
60
|
+
# @param url [Wgit::Url] A URL to crawl.
|
32
61
|
def <<(url)
|
33
|
-
|
62
|
+
add_url(url)
|
34
63
|
end
|
35
|
-
|
64
|
+
|
36
65
|
# Crawls individual urls, not entire sites.
|
37
|
-
#
|
38
|
-
#
|
39
|
-
#
|
40
|
-
|
66
|
+
#
|
67
|
+
# @param urls [Array<Wgit::Url>] The URLs to crawl.
|
68
|
+
# @yield [doc] If provided, the block is given each crawled
|
69
|
+
# Document. Otherwise each doc is added to @docs which can be accessed
|
70
|
+
# by Crawler#docs after this method returns.
|
71
|
+
# @return [Wgit::Document] The last Document crawled.
|
72
|
+
def crawl_urls(urls = @urls, &block)
|
41
73
|
raise "No urls to crawl" unless urls
|
42
74
|
@docs = []
|
43
75
|
doc = nil
|
44
76
|
Wgit::Utils.each(urls) { |url| doc = handle_crawl_block(url, &block) }
|
45
77
|
doc ? doc : @docs.last
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
#
|
50
|
-
#
|
51
|
-
|
52
|
-
|
53
|
-
|
78
|
+
end
|
79
|
+
|
80
|
+
# Crawl the url and return the response document or nil.
|
81
|
+
#
|
82
|
+
# @param url [Wgit::Document] The URL to crawl.
|
83
|
+
# @yield [doc] The crawled HTML Document regardless if the
|
84
|
+
# crawl was successful or not. Therefore, the Document#url can be used.
|
85
|
+
# @return [Wgit::Document, nil] The crawled HTML Document or nil if the
|
86
|
+
# crawl was unsuccessful.
|
87
|
+
def crawl_url(url = @urls.first)
|
88
|
+
assert_type(url, Wgit::Url)
|
89
|
+
markup = fetch(url)
|
54
90
|
url.crawled = true
|
55
91
|
doc = Wgit::Document.new(url, markup)
|
56
|
-
|
92
|
+
yield(doc) if block_given?
|
57
93
|
doc.empty? ? nil : doc
|
58
|
-
|
59
|
-
|
94
|
+
end
|
95
|
+
|
60
96
|
# Crawls an entire site by recursively going through its internal_links.
|
61
|
-
#
|
62
|
-
#
|
63
|
-
#
|
64
|
-
#
|
97
|
+
#
|
98
|
+
# @param base_url [Wgit::Url] The base URL of the website to be crawled.
|
99
|
+
# @yield [doc] Given each crawled Document/page of the site.
|
100
|
+
# A block is the only way to interact with each crawled Document.
|
101
|
+
# @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
|
102
|
+
# from all of the site's pages or nil if the base_url could not be
|
103
|
+
# crawled successfully.
|
65
104
|
def crawl_site(base_url = @urls.first, &block)
|
66
|
-
assert_type(base_url, Url)
|
105
|
+
assert_type(base_url, Wgit::Url)
|
67
106
|
|
68
107
|
doc = crawl_url(base_url, &block)
|
69
108
|
return nil if doc.nil?
|
@@ -75,7 +114,7 @@ module Wgit
|
|
75
114
|
return doc.external_links.uniq if internal_urls.empty?
|
76
115
|
|
77
116
|
loop do
|
78
|
-
internal_urls.uniq!
|
117
|
+
internal_urls.uniq!
|
79
118
|
|
80
119
|
links = internal_urls - crawled_urls
|
81
120
|
break if links.empty?
|
@@ -94,36 +133,37 @@ module Wgit
|
|
94
133
|
|
95
134
|
private
|
96
135
|
|
97
|
-
# Add the document to the @docs array for later processing
|
98
|
-
#
|
136
|
+
# Add the document to the @docs array for later processing or let the block
|
137
|
+
# process it here and now.
|
99
138
|
def handle_crawl_block(url, &block)
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
139
|
+
if block_given?
|
140
|
+
crawl_url(url, &block)
|
141
|
+
else
|
142
|
+
@docs << crawl_url(url)
|
143
|
+
nil
|
144
|
+
end
|
106
145
|
end
|
107
146
|
|
108
147
|
# The fetch method performs a HTTP GET to obtain the HTML document.
|
109
|
-
# Invalid urls or any HTTP response that doesn't return a HTML body
|
110
|
-
#
|
111
|
-
#
|
148
|
+
# Invalid urls or any HTTP response that doesn't return a HTML body will be
|
149
|
+
# ignored and nil will be returned. This means that redirects etc. will
|
150
|
+
# not be followed.
|
112
151
|
def fetch(url)
|
113
|
-
|
114
|
-
|
115
|
-
|
152
|
+
raise unless url.respond_to?(:to_uri)
|
153
|
+
res = Net::HTTP.get_response(url.to_uri)
|
154
|
+
res.body.empty? ? nil : res.body
|
116
155
|
rescue
|
117
|
-
|
156
|
+
nil
|
118
157
|
end
|
119
158
|
|
159
|
+
# Add the url to @urls ensuring it is cast to a Wgit::Url if necessary.
|
120
160
|
def add_url(url)
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
161
|
+
@urls = [] if @urls.nil?
|
162
|
+
if url.is_a?(Wgit::Url)
|
163
|
+
@urls << url
|
164
|
+
else
|
165
|
+
@urls << Wgit::Url.new(url)
|
166
|
+
end
|
127
167
|
end
|
128
168
|
|
129
169
|
alias :crawl :crawl_urls
|
@@ -2,22 +2,19 @@ require_relative '../document'
|
|
2
2
|
require_relative '../url'
|
3
3
|
require_relative '../utils'
|
4
4
|
require_relative '../assertable'
|
5
|
-
require_relative 'mongo_connection_details'
|
6
5
|
require_relative 'model'
|
7
6
|
require 'mongo'
|
8
7
|
|
9
8
|
module Wgit
|
10
9
|
|
11
|
-
# @author Michael Telford
|
12
10
|
# Class modeling a DB connection and CRUD operations for the Url and
|
13
11
|
# Document collections.
|
14
|
-
# The most common methods are: insert, update, urls, search, stats, size.
|
15
12
|
class Database
|
16
13
|
include Assertable
|
17
|
-
|
18
|
-
#
|
19
|
-
|
20
|
-
|
14
|
+
|
15
|
+
# Initializes a database connection client.
|
16
|
+
#
|
17
|
+
# @raise [RuntimeError] If Wgit::CONNECTION_DETAILS aren't set.
|
21
18
|
def initialize
|
22
19
|
conn_details = Wgit::CONNECTION_DETAILS
|
23
20
|
if conn_details.empty?
|
@@ -25,146 +22,188 @@ module Wgit
|
|
25
22
|
:port, :db, :uname, :pword for a database connection to be established."
|
26
23
|
end
|
27
24
|
|
28
|
-
|
25
|
+
# Only log to STDOUT in fatal scenarios.
|
26
|
+
Mongo::Logger.logger.level = Logger::FATAL
|
27
|
+
|
29
28
|
address = "#{conn_details[:host]}:#{conn_details[:port]}"
|
30
29
|
@@client = Mongo::Client.new([address],
|
31
|
-
:
|
32
|
-
:
|
33
|
-
:
|
34
|
-
:logger => logger,
|
35
|
-
:truncate_logs => false)
|
30
|
+
database: conn_details[:db],
|
31
|
+
user: conn_details[:uname],
|
32
|
+
password: conn_details[:pword])
|
36
33
|
end
|
37
34
|
|
38
35
|
### Create Data ###
|
39
36
|
|
37
|
+
# Insert one or more Url or Document objects into the DB.
|
38
|
+
#
|
39
|
+
# @param data [Hash, Enumerable<Hash>] Hash(es) returned from
|
40
|
+
# Wgit::Model.url or Wgit::Model.document.
|
41
|
+
# @raise [RuntimeError] If the data is not valid.
|
40
42
|
def insert(data)
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
else
|
49
|
-
insert_docs(data)
|
50
|
-
end
|
51
|
-
else
|
52
|
-
raise "data is not in the correct format (all Url's or Document's)"
|
53
|
-
end
|
54
|
-
end
|
55
|
-
|
56
|
-
def insert_urls(url_or_urls)
|
57
|
-
unless url_or_urls.respond_to?(:map)
|
58
|
-
assert_type(url_or_urls, Url)
|
59
|
-
url_or_urls = Wgit::Model.url(url_or_urls)
|
60
|
-
else
|
61
|
-
assert_arr_types(url_or_urls, Url)
|
62
|
-
url_or_urls = url_or_urls.map do |url|
|
63
|
-
Wgit::Model.url(url)
|
64
|
-
end
|
65
|
-
end
|
66
|
-
create(:urls, url_or_urls)
|
67
|
-
end
|
68
|
-
|
69
|
-
def insert_docs(doc_or_docs)
|
70
|
-
unless doc_or_docs.respond_to?(:map)
|
71
|
-
assert_type(doc_or_docs, [Document, Hash])
|
72
|
-
unless doc_or_docs.is_a?(Hash)
|
73
|
-
doc_or_docs = Wgit::Model.document(doc_or_docs)
|
74
|
-
end
|
43
|
+
if data.is_a?(Url)
|
44
|
+
insert_urls(data)
|
45
|
+
elsif data.is_a?(Document)
|
46
|
+
insert_docs(data)
|
47
|
+
elsif data.respond_to?(:first)
|
48
|
+
if data.first.is_a?(Url)
|
49
|
+
insert_urls(data)
|
75
50
|
else
|
76
|
-
|
77
|
-
doc_or_docs = doc_or_docs.map do |doc|
|
78
|
-
Wgit::Model.document(doc) unless doc.is_a?(Hash)
|
79
|
-
end
|
51
|
+
insert_docs(data)
|
80
52
|
end
|
81
|
-
|
53
|
+
else
|
54
|
+
raise "data is not in the correct format (all Url's or Document's)"
|
55
|
+
end
|
82
56
|
end
|
83
57
|
|
84
58
|
### Retrieve Data ###
|
85
59
|
|
86
|
-
#
|
87
|
-
#
|
88
|
-
#
|
89
|
-
#
|
90
|
-
|
91
|
-
|
60
|
+
# Returns Url records from the DB. All Urls are sorted by date_added
|
61
|
+
# ascending, in other words the first url returned is the first one that
|
62
|
+
# was inserted into the DB.
|
63
|
+
#
|
64
|
+
# @param crawled [Boolean] Filter by Url#crawled value. nil returns all.
|
65
|
+
# @param limit [Integer] The max number of Url's to return. 0 returns all.
|
66
|
+
# @param skip [Integer] Skip n amount of Url's.
|
67
|
+
# @yield [url] Given each Url returned from the DB.
|
68
|
+
# @return [Array<Wgit::Url>] The Urls obtained from the DB.
|
69
|
+
def urls(crawled = nil, limit = 0, skip = 0)
|
70
|
+
crawled.nil? ? query = {} : query = { crawled: crawled }
|
92
71
|
|
93
|
-
sort = { :
|
72
|
+
sort = { date_added: 1 }
|
94
73
|
results = retrieve(:urls, query, sort, {}, limit, skip)
|
95
74
|
return [] if results.count < 1
|
96
75
|
|
97
76
|
# results.respond_to? :map! is false so we use map and overwrite the var.
|
98
77
|
results = results.map { |url_doc| Wgit::Url.new(url_doc) }
|
99
|
-
|
100
|
-
|
78
|
+
results.each { |url| yield(url) } if block_given?
|
79
|
+
|
80
|
+
results
|
101
81
|
end
|
102
82
|
|
83
|
+
# Returns Url records that have been crawled.
|
84
|
+
#
|
85
|
+
# @param limit [Integer] The max number of Url's to return. 0 returns all.
|
86
|
+
# @param skip [Integer] Skip n amount of Url's.
|
87
|
+
# @yield [url] Given each Url returned from the DB.
|
88
|
+
# @return [Array<Wgit::Url>] The crawled Urls obtained from the DB.
|
103
89
|
def crawled_urls(limit = 0, skip = 0, &block)
|
104
90
|
urls(true, limit, skip, &block)
|
105
91
|
end
|
106
|
-
|
92
|
+
|
93
|
+
# Returned Url records that haven't been crawled. Each Url is yielded to a
|
94
|
+
# block, if given.
|
95
|
+
#
|
96
|
+
# @param limit [Integer] The max number of Url's to return. 0 returns all.
|
97
|
+
# @param skip [Integer] Skip n amount of Url's.
|
98
|
+
# @yield [url] Given each Url returned from the DB.
|
99
|
+
# @return [Array<Wgit::Url>] The uncrawled Urls obtained from the DB.
|
107
100
|
def uncrawled_urls(limit = 0, skip = 0, &block)
|
108
101
|
urls(false, limit, skip, &block)
|
109
102
|
end
|
110
103
|
|
104
|
+
# Searches against the indexed docs in the DB for the given query.
|
105
|
+
#
|
111
106
|
# Currently all searches are case insensitive.
|
112
107
|
#
|
113
|
-
#
|
114
|
-
# The searched fields are decided by the text index setup against the
|
108
|
+
# The searched fields are decided by the text index setup against the
|
115
109
|
# documents collection. Currently we search against the following fields:
|
116
110
|
# "author", "keywords", "title" and "text".
|
117
111
|
#
|
118
|
-
# The MongoDB search ranks/sorts the results in order (highest first) based
|
119
|
-
# upon each documents textScore which records the number of
|
120
|
-
# then store this textScore in each Document object for use
|
121
|
-
# needed.
|
112
|
+
# The MongoDB search ranks/sorts the results in order (highest first) based
|
113
|
+
# upon each documents textScore which records the number of query hits. We
|
114
|
+
# then store this textScore in each Document result object for use
|
115
|
+
# elsewhere if needed.
|
122
116
|
#
|
123
|
-
# @param
|
124
|
-
# @param whole_sentence [Boolean]
|
125
|
-
#
|
126
|
-
# @param limit [
|
127
|
-
# @param skip [
|
128
|
-
#
|
129
|
-
# @
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
text.strip!
|
134
|
-
text.replace("\"" + text + "\"") if whole_sentence
|
117
|
+
# @param query [String] The text query to search with.
|
118
|
+
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
119
|
+
# for separately.
|
120
|
+
# @param limit [Integer] The max number of results to return.
|
121
|
+
# @param skip [Integer] The number of DB records to skip.
|
122
|
+
# @yield [doc] Given each search result (Wgit::Document).
|
123
|
+
# @return [Array<Wgit::Document>] The search results obtained from the DB.
|
124
|
+
def search(query, whole_sentence = false, limit = 10, skip = 0)
|
125
|
+
query.strip!
|
126
|
+
query.replace("\"" + query + "\"") if whole_sentence
|
135
127
|
|
136
|
-
# The
|
137
|
-
# We use the
|
138
|
-
# :$caseSensitive => case_sensitive,
|
139
|
-
sort_proj = { :
|
140
|
-
query = { :$text => { :$search =>
|
128
|
+
# The sort_proj sorts based on the most search hits.
|
129
|
+
# We use the sort_proj hash as both a sort and a projection below.
|
130
|
+
# :$caseSensitive => case_sensitive, 3.2+ only.
|
131
|
+
sort_proj = { score: { :$meta => "textScore" } }
|
132
|
+
query = { :$text => { :$search => query } }
|
133
|
+
|
141
134
|
results = retrieve(:documents, query, sort_proj, sort_proj, limit, skip)
|
142
|
-
|
143
|
-
|
135
|
+
return [] if results.count < 1 # respond_to? :empty? == false
|
136
|
+
|
144
137
|
# results.respond_to? :map! is false so we use map and overwrite the var.
|
145
138
|
results = results.map { |mongo_doc| Wgit::Document.new(mongo_doc) }
|
146
|
-
|
147
|
-
|
139
|
+
results.each { |doc| yield(doc) } if block_given?
|
140
|
+
|
141
|
+
results
|
148
142
|
end
|
149
143
|
|
150
|
-
#
|
151
|
-
|
152
|
-
|
153
|
-
results = search(text, whole_sentence, limit, skip, &block)
|
154
|
-
Wgit::Utils.printf_search_results(results, text, false, sentence_length)
|
155
|
-
end
|
156
|
-
|
157
|
-
# Returns a Mongo object which can be used like a Hash to retrieve values.
|
144
|
+
# Returns statistics about the database.
|
145
|
+
#
|
146
|
+
# @return [BSON::Document#[]#fetch] Similar to a Hash instance.
|
158
147
|
def stats
|
159
|
-
|
148
|
+
@@client.command(dbStats: 0).documents[0]
|
160
149
|
end
|
161
150
|
|
151
|
+
# Returns the current size of the database.
|
152
|
+
#
|
153
|
+
# @return [Integer] The current size of the DB.
|
162
154
|
def size
|
163
|
-
|
155
|
+
stats[:dataSize]
|
164
156
|
end
|
165
|
-
|
157
|
+
|
158
|
+
# Returns the total number of URL records in the DB.
|
159
|
+
#
|
160
|
+
# @return [Integer] The current number of URL records.
|
161
|
+
def num_urls
|
162
|
+
@@client[:urls].count
|
163
|
+
end
|
164
|
+
|
165
|
+
# Returns the total number of Document records in the DB.
|
166
|
+
#
|
167
|
+
# @return [Integer] The current number of Document records.
|
168
|
+
def num_docs
|
169
|
+
@@client[:documents].count
|
170
|
+
end
|
171
|
+
|
172
|
+
# Returns the total number of records (urls + docs) in the DB.
|
173
|
+
#
|
174
|
+
# @return [Integer] The current number of URL and Document records.
|
175
|
+
def num_records
|
176
|
+
num_urls + num_docs
|
177
|
+
end
|
178
|
+
|
179
|
+
# Returns whether or not a record with the given url (which is unique)
|
180
|
+
# exists in the database's 'urls' collection.
|
181
|
+
#
|
182
|
+
# @param url [Wgit::Url] The Url to search the DB for.
|
183
|
+
# @return [Boolean] True if url exists, otherwise false.
|
184
|
+
def url?(url)
|
185
|
+
h = { "url" => url }
|
186
|
+
not @@client[:urls].find(h).none?
|
187
|
+
end
|
188
|
+
|
189
|
+
# Returns whether or not a record with the given doc.url (which is unique)
|
190
|
+
# exists in the database's 'documents' collection.
|
191
|
+
#
|
192
|
+
# @param doc [Wgit::Document] The Document to search the DB for.
|
193
|
+
# @return [Boolean] True if doc exists, otherwise false.
|
194
|
+
def doc?(doc)
|
195
|
+
url = doc.respond_to?(:url) ? doc.url : doc
|
196
|
+
h = { "url" => url }
|
197
|
+
not @@client[:documents].find(h).none?
|
198
|
+
end
|
199
|
+
|
166
200
|
### Update Data ###
|
167
201
|
|
202
|
+
# Update a Url or Document object in the DB.
|
203
|
+
#
|
204
|
+
# @param data [Hash, Enumerable<Hash>] Hash(es) returned from
|
205
|
+
# Wgit::Model.url or Wgit::Model.document.
|
206
|
+
# @raise [RuntimeError] If the data is not valid.
|
168
207
|
def update(data)
|
169
208
|
if data.is_a?(Url)
|
170
209
|
update_url(data)
|
@@ -174,96 +213,134 @@ module Wgit
|
|
174
213
|
raise "data is not in the correct format (all Url's or Document's)"
|
175
214
|
end
|
176
215
|
end
|
177
|
-
|
178
|
-
def update_url(url)
|
179
|
-
assert_type(url, Url)
|
180
|
-
selection = { :url => url }
|
181
|
-
url_hash = Wgit::Model.url(url).merge(Wgit::Model.common_update_data)
|
182
|
-
update = { "$set" => url_hash }
|
183
|
-
_update(true, :urls, selection, update)
|
184
|
-
end
|
185
|
-
|
186
|
-
def update_doc(doc)
|
187
|
-
assert_type(doc, Document)
|
188
|
-
selection = { :url => doc.url }
|
189
|
-
doc_hash = Wgit::Model.document(doc).merge(Wgit::Model.common_update_data)
|
190
|
-
update = { "$set" => doc_hash }
|
191
|
-
_update(true, :documents, selection, update)
|
192
|
-
end
|
193
|
-
|
194
|
-
private
|
195
216
|
|
217
|
+
private
|
218
|
+
|
219
|
+
# Return if the write to the DB succeeded or not.
|
196
220
|
def write_succeeded?(result, count = 1, multi = false)
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
result.n == count
|
209
|
-
else
|
210
|
-
result.documents.first[:err].nil?
|
211
|
-
end
|
221
|
+
case result.class.to_s
|
222
|
+
# Single create result.
|
223
|
+
when "Mongo::Operation::Insert::Result"
|
224
|
+
result.documents.first[:err].nil?
|
225
|
+
# Multiple create result.
|
226
|
+
when "Mongo::BulkWrite::Result"
|
227
|
+
result.inserted_count == count
|
228
|
+
# Single and multiple update result.
|
229
|
+
when "Mongo::Operation::Update::Result"
|
230
|
+
if multi
|
231
|
+
result.n == count
|
212
232
|
else
|
213
|
-
|
233
|
+
result.documents.first[:err].nil?
|
214
234
|
end
|
235
|
+
# Class no longer used, have you upgraded the 'mongo' gem?
|
236
|
+
else
|
237
|
+
raise "Result class not currently supported: #{result.class.to_s}"
|
238
|
+
end
|
239
|
+
end
|
240
|
+
|
241
|
+
# Insert one or more Url objects into the DB.
|
242
|
+
def insert_urls(url_or_urls)
|
243
|
+
unless url_or_urls.respond_to?(:map)
|
244
|
+
assert_type(url_or_urls, Url)
|
245
|
+
url_or_urls = Wgit::Model.url(url_or_urls)
|
246
|
+
else
|
247
|
+
assert_arr_types(url_or_urls, Url)
|
248
|
+
url_or_urls = url_or_urls.map do |url|
|
249
|
+
Wgit::Model.url(url)
|
250
|
+
end
|
251
|
+
end
|
252
|
+
create(:urls, url_or_urls)
|
215
253
|
end
|
216
254
|
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
unless write_succeeded?(result)
|
224
|
-
raise "DB write (insert) failed"
|
225
|
-
end
|
226
|
-
result.n
|
227
|
-
# Multiple docs.
|
228
|
-
elsif data.is_a?(Array)
|
229
|
-
assert_arr_types(data, Hash)
|
230
|
-
data.map! do |data_hash|
|
231
|
-
data_hash.merge(Wgit::Model.common_insert_data)
|
232
|
-
end
|
233
|
-
result = @@client[collection.to_sym].insert_many(data)
|
234
|
-
unless write_succeeded?(result, data.length)
|
235
|
-
raise "DB write(s) failed"
|
236
|
-
end
|
237
|
-
result.inserted_count
|
238
|
-
else
|
239
|
-
raise "data must be a Hash or an Array of Hash's"
|
255
|
+
# Insert one or more Document objects into the DB.
|
256
|
+
def insert_docs(doc_or_docs)
|
257
|
+
unless doc_or_docs.respond_to?(:map)
|
258
|
+
assert_type(doc_or_docs, [Document, Hash])
|
259
|
+
unless doc_or_docs.is_a?(Hash)
|
260
|
+
doc_or_docs = Wgit::Model.document(doc_or_docs)
|
240
261
|
end
|
262
|
+
else
|
263
|
+
assert_arr_types(doc_or_docs, [Document, Hash])
|
264
|
+
doc_or_docs = doc_or_docs.map do |doc|
|
265
|
+
Wgit::Model.document(doc) unless doc.is_a?(Hash)
|
266
|
+
end
|
267
|
+
end
|
268
|
+
create(:documents, doc_or_docs)
|
241
269
|
end
|
242
270
|
|
243
|
-
|
271
|
+
# Create/insert one or more Url or Document records into the DB.
|
272
|
+
def create(collection, data)
|
273
|
+
assert_type(data, [Hash, Array])
|
274
|
+
# Single doc.
|
275
|
+
if data.is_a?(Hash)
|
276
|
+
data.merge!(Wgit::Model.common_insert_data)
|
277
|
+
result = @@client[collection.to_sym].insert_one(data)
|
278
|
+
unless write_succeeded?(result)
|
279
|
+
raise "DB write (insert) failed"
|
280
|
+
end
|
281
|
+
result.n
|
282
|
+
# Multiple docs.
|
283
|
+
elsif data.is_a?(Array)
|
284
|
+
assert_arr_types(data, Hash)
|
285
|
+
data.map! do |data_hash|
|
286
|
+
data_hash.merge(Wgit::Model.common_insert_data)
|
287
|
+
end
|
288
|
+
result = @@client[collection.to_sym].insert_many(data)
|
289
|
+
unless write_succeeded?(result, data.length)
|
290
|
+
raise "DB write(s) failed"
|
291
|
+
end
|
292
|
+
result.inserted_count
|
293
|
+
else
|
294
|
+
raise "data must be a Hash or an Array of Hash's"
|
295
|
+
end
|
296
|
+
end
|
297
|
+
|
298
|
+
# Retrieve Url or Document records from the DB.
|
299
|
+
def retrieve(collection, query,
|
300
|
+
sort = {}, projection = {},
|
244
301
|
limit = 0, skip = 0)
|
245
|
-
|
246
|
-
|
247
|
-
|
302
|
+
assert_type(query, Hash)
|
303
|
+
@@client[collection.to_sym].find(query).projection(projection)
|
304
|
+
.skip(skip).limit(limit).sort(sort)
|
305
|
+
end
|
306
|
+
|
307
|
+
# Update a Url object in the DB.
|
308
|
+
def update_url(url)
|
309
|
+
assert_type(url, Url)
|
310
|
+
selection = { url: url }
|
311
|
+
url_hash = Wgit::Model.url(url).merge(Wgit::Model.common_update_data)
|
312
|
+
update = { "$set" => url_hash }
|
313
|
+
_update(true, :urls, selection, update)
|
314
|
+
end
|
315
|
+
|
316
|
+
# Update a Document object in the DB.
|
317
|
+
def update_doc(doc)
|
318
|
+
assert_type(doc, Document)
|
319
|
+
selection = { url: doc.url }
|
320
|
+
doc_hash = Wgit::Model.document(doc).merge(Wgit::Model.common_update_data)
|
321
|
+
update = { "$set" => doc_hash }
|
322
|
+
_update(true, :documents, selection, update)
|
248
323
|
end
|
249
324
|
|
325
|
+
# Update one or more Url or Document records in the DB.
|
250
326
|
# NOTE: The Model.common_update_data should be merged in the calling
|
251
327
|
# method as the update param can be bespoke due to its nature.
|
252
328
|
def _update(single, collection, selection, update)
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
329
|
+
assert_arr_types([selection, update], Hash)
|
330
|
+
if single
|
331
|
+
result = @@client[collection.to_sym].update_one(selection, update)
|
332
|
+
else
|
333
|
+
result = @@client[collection.to_sym].update_many(selection, update)
|
334
|
+
end
|
335
|
+
raise "DB write (update) failed" unless write_succeeded?(result)
|
336
|
+
result.n
|
261
337
|
end
|
262
338
|
|
263
339
|
alias :count :size
|
264
340
|
alias :length :size
|
341
|
+
alias :num_documents :num_docs
|
342
|
+
alias :document? :doc?
|
265
343
|
alias :insert_url :insert_urls
|
266
344
|
alias :insert_doc :insert_docs
|
267
|
-
alias :search_and_format :search_p
|
268
345
|
end
|
269
346
|
end
|