wgit 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/lib/wgit.rb +1 -1
- data/lib/wgit/assertable.rb +72 -61
- data/lib/wgit/core_ext.rb +11 -5
- data/lib/wgit/crawler.rb +97 -57
- data/lib/wgit/database/database.rb +247 -170
- data/lib/wgit/database/model.rb +40 -24
- data/lib/wgit/database/mongo_connection_details.rb +44 -23
- data/lib/wgit/document.rb +534 -233
- data/lib/wgit/indexer.rb +235 -0
- data/lib/wgit/url.rb +199 -121
- data/lib/wgit/utils.rb +143 -96
- data/lib/wgit/version.rb +5 -1
- metadata +10 -9
- data/lib/wgit/web_crawler.rb +0 -134
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: c2ee83f5b722a6aff6fad7727751e149b58de02de3abecd49e51066a1ecf035d
|
4
|
+
data.tar.gz: 112fb3b45192e781d8a4b4eeaca6d3ad7abaaa1a29b8833654b17a9b38644f81
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a29f32db3538bed0e2b09ae599623ee9751ed0f929f83b1d7a987982132431cc72b89fa5bc3e2522c50c50c4a7d0fd33aaf0fbc000986b1db8f0494a37cebd7c
|
7
|
+
data.tar.gz: faf0f814dad58fef4a4ec61a5cdd3f4b3ba7170b417e48a18f4a45922efa0dcab8ffaf4bbe0d198d53e5778056599e6827d1315ce68e08984ca2af47e26631bc
|
data/lib/wgit.rb
CHANGED
data/lib/wgit/assertable.rb
CHANGED
@@ -1,69 +1,80 @@
|
|
1
|
-
|
2
1
|
module Wgit
|
3
2
|
|
4
|
-
#
|
5
|
-
#
|
6
|
-
# for asserting the integrity of method definitions etc.
|
3
|
+
# Module containing assert methods including type checking which can be used
|
4
|
+
# for asserting the integrity of method definitions etc.
|
7
5
|
module Assertable
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
end
|
26
|
-
|
27
|
-
# Each object within arr must match one of the types listed in
|
28
|
-
# type_or_types or an exception is thrown using msg if provided.
|
29
|
-
# type_or_types can be a single Class or an Enumerable of Class objects,
|
30
|
-
# Strings and Symbols will not work.
|
31
|
-
def assert_arr_types(arr, type_or_types, msg = nil)
|
32
|
-
raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
|
33
|
-
arr.each do |obj|
|
34
|
-
assert_types(obj, type_or_types, msg)
|
35
|
-
end
|
6
|
+
DEFAULT_TYPE_FAIL_MSG = "Expected: %s, Actual: %s".freeze
|
7
|
+
WRONG_METHOD_MSG = "arr must be Enumerable, use a different method".freeze
|
8
|
+
DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s".freeze
|
9
|
+
|
10
|
+
# Tests if the obj is of a given type.
|
11
|
+
#
|
12
|
+
# @param obj [Object] The Object to test.
|
13
|
+
# @param type_or_types [Type, Array<Type>] The type/types that obj must
|
14
|
+
# belong to or an exception is thrown.
|
15
|
+
# @param msg [String] The raised RuntimeError message, if provided.
|
16
|
+
# @return [Object] The given obj on successful assertion.
|
17
|
+
def assert_types(obj, type_or_types, msg = nil)
|
18
|
+
msg ||= DEFAULT_TYPE_FAIL_MSG % [type_or_types, obj.class]
|
19
|
+
if type_or_types.respond_to?(:any?)
|
20
|
+
match = type_or_types.any? { |type| obj.instance_of?(type) }
|
21
|
+
else
|
22
|
+
match = obj.instance_of?(type_or_types)
|
36
23
|
end
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
24
|
+
raise msg unless match
|
25
|
+
obj
|
26
|
+
end
|
27
|
+
|
28
|
+
# Each object within arr must match one of the types listed in
|
29
|
+
# type_or_types or an exception is raised using msg, if provided.
|
30
|
+
#
|
31
|
+
# @param arr [Enumerable#each] Enumerable of objects to type check.
|
32
|
+
# @param type_or_types [Type, Array<Type>] The allowed type(s).
|
33
|
+
# @param msg [String] The raised RuntimeError message, if provided.
|
34
|
+
# @return [Object] The given arr on successful assertion.
|
35
|
+
def assert_arr_types(arr, type_or_types, msg = nil)
|
36
|
+
raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
|
37
|
+
arr.each do |obj|
|
38
|
+
assert_types(obj, type_or_types, msg)
|
50
39
|
end
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
40
|
+
end
|
41
|
+
|
42
|
+
# The obj_or_objs must respond_to? all of the given methods or an
|
43
|
+
# Exception is raised using msg, if provided.
|
44
|
+
#
|
45
|
+
# @param obj_or_objs [Object, Enumerable#each] The objects to duck check.
|
46
|
+
# @param methods [Array<Symbol>] The methods to :respond_to?.
|
47
|
+
# @param msg [String] The raised RuntimeError message, if provided.
|
48
|
+
# @return [Object] The given obj_or_objs on successful assertion.
|
49
|
+
def assert_respond_to(obj_or_objs, methods, msg = nil)
|
50
|
+
methods = [methods] unless methods.respond_to?(:all?)
|
51
|
+
if obj_or_objs.respond_to?(:each)
|
52
|
+
obj_or_objs.each do |obj|
|
53
|
+
_assert_respond_to(obj, methods, msg)
|
54
|
+
end
|
55
|
+
else
|
56
|
+
_assert_respond_to(obj_or_objs, methods, msg)
|
59
57
|
end
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
58
|
+
obj_or_objs
|
59
|
+
end
|
60
|
+
|
61
|
+
private
|
62
|
+
|
63
|
+
# obj must respond_to? all methods or an exception is raised.
|
64
|
+
def _assert_respond_to(obj, methods, msg = nil)
|
65
|
+
raise "methods must respond_to? :all?" unless methods.respond_to?(:all?)
|
66
|
+
msg ||= DEFAULT_DUCK_FAIL_MSG % ["#{obj.class} (#{obj})", methods]
|
67
|
+
match = methods.all? { |method| obj.respond_to?(method) }
|
68
|
+
raise msg unless match
|
69
|
+
obj
|
70
|
+
end
|
71
|
+
|
72
|
+
alias :assert_type :assert_types
|
73
|
+
alias :type :assert_types
|
74
|
+
alias :types :assert_types
|
75
|
+
alias :assert_arr_type :assert_arr_types
|
76
|
+
alias :arr_type :assert_arr_types
|
77
|
+
alias :arr_types :assert_arr_types
|
78
|
+
alias :respond_to :assert_respond_to
|
68
79
|
end
|
69
80
|
end
|
data/lib/wgit/core_ext.rb
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
require_relative 'url'
|
2
2
|
|
3
|
-
# @author Michael Telford
|
4
3
|
# Script which extends Ruby's core functionality when parsed.
|
5
|
-
# Needs to be required separately using `require 'wgit/core_ext'`.
|
4
|
+
# Needs to be required separately using `require 'wgit/core_ext'`.
|
6
5
|
|
7
6
|
class String
|
8
|
-
# Converts a String into a Wgit::Url object.
|
7
|
+
# Converts a String into a Wgit::Url object.
|
8
|
+
#
|
9
|
+
# @return [Wgit::Url] The converted URL.
|
9
10
|
def to_url
|
10
11
|
Wgit::Url.new(self)
|
11
12
|
end
|
@@ -13,7 +14,9 @@ end
|
|
13
14
|
|
14
15
|
module Enumerable
|
15
16
|
# Converts each String instance into a Wgit::Url object and returns the new
|
16
|
-
#
|
17
|
+
# Array.
|
18
|
+
#
|
19
|
+
# @return [Array<Wgit::Url>] The converted URLs.
|
17
20
|
def to_urls
|
18
21
|
map do |element|
|
19
22
|
process_url_element(element)
|
@@ -21,7 +24,9 @@ module Enumerable
|
|
21
24
|
end
|
22
25
|
|
23
26
|
# Converts each String instance into a Wgit::Url object and returns the
|
24
|
-
# updated array.
|
27
|
+
# updated array. Modifies the receiver.
|
28
|
+
#
|
29
|
+
# @return [Array<Wgit::Url>] Self containing the converted URLs.
|
25
30
|
def to_urls!
|
26
31
|
map! do |element|
|
27
32
|
process_url_element(element)
|
@@ -31,6 +36,7 @@ end
|
|
31
36
|
|
32
37
|
private
|
33
38
|
|
39
|
+
# Converts the element to a Wgit::Url if the element is a String.
|
34
40
|
def process_url_element(element)
|
35
41
|
if element.is_a? String
|
36
42
|
element.to_url
|
data/lib/wgit/crawler.rb
CHANGED
@@ -3,67 +3,106 @@ require_relative 'document'
|
|
3
3
|
require_relative 'utils'
|
4
4
|
require_relative 'assertable'
|
5
5
|
require 'net/http' # requires 'uri'
|
6
|
-
|
6
|
+
|
7
7
|
module Wgit
|
8
8
|
|
9
|
-
#
|
10
|
-
#
|
11
|
-
# Note that
|
12
|
-
# functionality.
|
9
|
+
# The Crawler class provides a means of crawling web based URL's, turning
|
10
|
+
# their HTML into Wgit::Document's.
|
11
|
+
# Note that redirects are currently not followed during a crawl.
|
13
12
|
class Crawler
|
14
13
|
include Assertable
|
15
14
|
|
16
|
-
|
15
|
+
# The urls to crawl.
|
16
|
+
attr_reader :urls
|
17
|
+
|
18
|
+
# The docs of the crawled @urls.
|
19
|
+
attr_reader :docs
|
17
20
|
|
18
|
-
|
19
|
-
|
21
|
+
# Initializes the Crawler by setting the @urls and @docs.
|
22
|
+
#
|
23
|
+
# @param urls [*Wgit::Url] The URLs to crawl.
|
24
|
+
def initialize(*urls)
|
25
|
+
self.[](*urls)
|
20
26
|
@docs = []
|
21
|
-
|
22
|
-
|
27
|
+
end
|
28
|
+
|
29
|
+
# Sets this Crawler's @urls.
|
30
|
+
#
|
31
|
+
# @param urls [Array<Wgit::Url>] The URLs to crawl.
|
23
32
|
def urls=(urls)
|
24
|
-
|
25
|
-
|
33
|
+
@urls = []
|
34
|
+
Wgit::Utils.each(urls) { |url| add_url(url) }
|
26
35
|
end
|
27
36
|
|
37
|
+
# Sets this Crawler's @urls.
|
38
|
+
#
|
39
|
+
# @param urls [*Wgit::Url] The URLs to crawl.
|
28
40
|
def [](*urls)
|
29
|
-
|
41
|
+
# If urls is nil then add_url (when called later) will set @urls = []
|
42
|
+
# so we do nothing here.
|
43
|
+
if not urls.nil?
|
44
|
+
# Due to *urls you can end up with [[url1,url2,url3]] etc. where the
|
45
|
+
# outer array is bogus so we use the inner one only.
|
46
|
+
if urls.is_a?(Enumerable) &&
|
47
|
+
urls.length == 1 &&
|
48
|
+
urls.first.is_a?(Enumerable)
|
49
|
+
urls = urls.first
|
50
|
+
end
|
51
|
+
|
52
|
+
# Here we call urls= method using self because the param name is also
|
53
|
+
# urls which conflicts.
|
54
|
+
self.urls = urls
|
55
|
+
end
|
30
56
|
end
|
31
57
|
|
58
|
+
# Adds the url to this Crawler's @urls.
|
59
|
+
#
|
60
|
+
# @param url [Wgit::Url] A URL to crawl.
|
32
61
|
def <<(url)
|
33
|
-
|
62
|
+
add_url(url)
|
34
63
|
end
|
35
|
-
|
64
|
+
|
36
65
|
# Crawls individual urls, not entire sites.
|
37
|
-
#
|
38
|
-
#
|
39
|
-
#
|
40
|
-
|
66
|
+
#
|
67
|
+
# @param urls [Array<Wgit::Url>] The URLs to crawl.
|
68
|
+
# @yield [doc] If provided, the block is given each crawled
|
69
|
+
# Document. Otherwise each doc is added to @docs which can be accessed
|
70
|
+
# by Crawler#docs after this method returns.
|
71
|
+
# @return [Wgit::Document] The last Document crawled.
|
72
|
+
def crawl_urls(urls = @urls, &block)
|
41
73
|
raise "No urls to crawl" unless urls
|
42
74
|
@docs = []
|
43
75
|
doc = nil
|
44
76
|
Wgit::Utils.each(urls) { |url| doc = handle_crawl_block(url, &block) }
|
45
77
|
doc ? doc : @docs.last
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
#
|
50
|
-
#
|
51
|
-
|
52
|
-
|
53
|
-
|
78
|
+
end
|
79
|
+
|
80
|
+
# Crawl the url and return the response document or nil.
|
81
|
+
#
|
82
|
+
# @param url [Wgit::Url] The URL to crawl.
|
83
|
+
# @yield [doc] The crawled HTML Document regardless of whether the
|
84
|
+
# crawl was successful or not. Therefore, the Document#url can be used.
|
85
|
+
# @return [Wgit::Document, nil] The crawled HTML Document or nil if the
|
86
|
+
# crawl was unsuccessful.
|
87
|
+
def crawl_url(url = @urls.first)
|
88
|
+
assert_type(url, Wgit::Url)
|
89
|
+
markup = fetch(url)
|
54
90
|
url.crawled = true
|
55
91
|
doc = Wgit::Document.new(url, markup)
|
56
|
-
|
92
|
+
yield(doc) if block_given?
|
57
93
|
doc.empty? ? nil : doc
|
58
|
-
|
59
|
-
|
94
|
+
end
|
95
|
+
|
60
96
|
# Crawls an entire site by recursively going through its internal_links.
|
61
|
-
#
|
62
|
-
#
|
63
|
-
#
|
64
|
-
#
|
97
|
+
#
|
98
|
+
# @param base_url [Wgit::Url] The base URL of the website to be crawled.
|
99
|
+
# @yield [doc] Given each crawled Document/page of the site.
|
100
|
+
# A block is the only way to interact with each crawled Document.
|
101
|
+
# @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
|
102
|
+
# from all of the site's pages or nil if the base_url could not be
|
103
|
+
# crawled successfully.
|
65
104
|
def crawl_site(base_url = @urls.first, &block)
|
66
|
-
assert_type(base_url, Url)
|
105
|
+
assert_type(base_url, Wgit::Url)
|
67
106
|
|
68
107
|
doc = crawl_url(base_url, &block)
|
69
108
|
return nil if doc.nil?
|
@@ -75,7 +114,7 @@ module Wgit
|
|
75
114
|
return doc.external_links.uniq if internal_urls.empty?
|
76
115
|
|
77
116
|
loop do
|
78
|
-
internal_urls.uniq!
|
117
|
+
internal_urls.uniq!
|
79
118
|
|
80
119
|
links = internal_urls - crawled_urls
|
81
120
|
break if links.empty?
|
@@ -94,36 +133,37 @@ module Wgit
|
|
94
133
|
|
95
134
|
private
|
96
135
|
|
97
|
-
# Add the document to the @docs array for later processing
|
98
|
-
#
|
136
|
+
# Add the document to the @docs array for later processing or let the block
|
137
|
+
# process it here and now.
|
99
138
|
def handle_crawl_block(url, &block)
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
139
|
+
if block_given?
|
140
|
+
crawl_url(url, &block)
|
141
|
+
else
|
142
|
+
@docs << crawl_url(url)
|
143
|
+
nil
|
144
|
+
end
|
106
145
|
end
|
107
146
|
|
108
147
|
# The fetch method performs a HTTP GET to obtain the HTML document.
|
109
|
-
# Invalid urls or any HTTP response that doesn't return a HTML body
|
110
|
-
#
|
111
|
-
#
|
148
|
+
# Invalid urls or any HTTP response that doesn't return a HTML body will be
|
149
|
+
# ignored and nil will be returned. This means that redirects etc. will
|
150
|
+
# not be followed.
|
112
151
|
def fetch(url)
|
113
|
-
|
114
|
-
|
115
|
-
|
152
|
+
raise unless url.respond_to?(:to_uri)
|
153
|
+
res = Net::HTTP.get_response(url.to_uri)
|
154
|
+
res.body.empty? ? nil : res.body
|
116
155
|
rescue
|
117
|
-
|
156
|
+
nil
|
118
157
|
end
|
119
158
|
|
159
|
+
# Add the url to @urls ensuring it is cast to a Wgit::Url if necessary.
|
120
160
|
def add_url(url)
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
161
|
+
@urls = [] if @urls.nil?
|
162
|
+
if url.is_a?(Wgit::Url)
|
163
|
+
@urls << url
|
164
|
+
else
|
165
|
+
@urls << Wgit::Url.new(url)
|
166
|
+
end
|
127
167
|
end
|
128
168
|
|
129
169
|
alias :crawl :crawl_urls
|
@@ -2,22 +2,19 @@ require_relative '../document'
|
|
2
2
|
require_relative '../url'
|
3
3
|
require_relative '../utils'
|
4
4
|
require_relative '../assertable'
|
5
|
-
require_relative 'mongo_connection_details'
|
6
5
|
require_relative 'model'
|
7
6
|
require 'mongo'
|
8
7
|
|
9
8
|
module Wgit
|
10
9
|
|
11
|
-
# @author Michael Telford
|
12
10
|
# Class modeling a DB connection and CRUD operations for the Url and
|
13
11
|
# Document collections.
|
14
|
-
# The most common methods are: insert, update, urls, search, stats, size.
|
15
12
|
class Database
|
16
13
|
include Assertable
|
17
|
-
|
18
|
-
#
|
19
|
-
|
20
|
-
|
14
|
+
|
15
|
+
# Initializes a database connection client.
|
16
|
+
#
|
17
|
+
# @raise [RuntimeError] If Wgit::CONNECTION_DETAILS aren't set.
|
21
18
|
def initialize
|
22
19
|
conn_details = Wgit::CONNECTION_DETAILS
|
23
20
|
if conn_details.empty?
|
@@ -25,146 +22,188 @@ module Wgit
|
|
25
22
|
:port, :db, :uname, :pword for a database connection to be established."
|
26
23
|
end
|
27
24
|
|
28
|
-
|
25
|
+
# Only log to STDOUT in fatal scenarios.
|
26
|
+
Mongo::Logger.logger.level = Logger::FATAL
|
27
|
+
|
29
28
|
address = "#{conn_details[:host]}:#{conn_details[:port]}"
|
30
29
|
@@client = Mongo::Client.new([address],
|
31
|
-
:
|
32
|
-
:
|
33
|
-
:
|
34
|
-
:logger => logger,
|
35
|
-
:truncate_logs => false)
|
30
|
+
database: conn_details[:db],
|
31
|
+
user: conn_details[:uname],
|
32
|
+
password: conn_details[:pword])
|
36
33
|
end
|
37
34
|
|
38
35
|
### Create Data ###
|
39
36
|
|
37
|
+
# Insert one or more Url or Document objects into the DB.
|
38
|
+
#
|
39
|
+
# @param data [Wgit::Url, Wgit::Document, Enumerable] A Url or Document, or
|
40
|
+
# an Enumerable of either, to insert.
|
41
|
+
# @raise [RuntimeError] If the data is not valid.
|
40
42
|
def insert(data)
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
else
|
49
|
-
insert_docs(data)
|
50
|
-
end
|
51
|
-
else
|
52
|
-
raise "data is not in the correct format (all Url's or Document's)"
|
53
|
-
end
|
54
|
-
end
|
55
|
-
|
56
|
-
def insert_urls(url_or_urls)
|
57
|
-
unless url_or_urls.respond_to?(:map)
|
58
|
-
assert_type(url_or_urls, Url)
|
59
|
-
url_or_urls = Wgit::Model.url(url_or_urls)
|
60
|
-
else
|
61
|
-
assert_arr_types(url_or_urls, Url)
|
62
|
-
url_or_urls = url_or_urls.map do |url|
|
63
|
-
Wgit::Model.url(url)
|
64
|
-
end
|
65
|
-
end
|
66
|
-
create(:urls, url_or_urls)
|
67
|
-
end
|
68
|
-
|
69
|
-
def insert_docs(doc_or_docs)
|
70
|
-
unless doc_or_docs.respond_to?(:map)
|
71
|
-
assert_type(doc_or_docs, [Document, Hash])
|
72
|
-
unless doc_or_docs.is_a?(Hash)
|
73
|
-
doc_or_docs = Wgit::Model.document(doc_or_docs)
|
74
|
-
end
|
43
|
+
if data.is_a?(Url)
|
44
|
+
insert_urls(data)
|
45
|
+
elsif data.is_a?(Document)
|
46
|
+
insert_docs(data)
|
47
|
+
elsif data.respond_to?(:first)
|
48
|
+
if data.first.is_a?(Url)
|
49
|
+
insert_urls(data)
|
75
50
|
else
|
76
|
-
|
77
|
-
doc_or_docs = doc_or_docs.map do |doc|
|
78
|
-
Wgit::Model.document(doc) unless doc.is_a?(Hash)
|
79
|
-
end
|
51
|
+
insert_docs(data)
|
80
52
|
end
|
81
|
-
|
53
|
+
else
|
54
|
+
raise "data is not in the correct format (all Url's or Document's)"
|
55
|
+
end
|
82
56
|
end
|
83
57
|
|
84
58
|
### Retrieve Data ###
|
85
59
|
|
86
|
-
#
|
87
|
-
#
|
88
|
-
#
|
89
|
-
#
|
90
|
-
|
91
|
-
|
60
|
+
# Returns Url records from the DB. All Urls are sorted by date_added
|
61
|
+
# ascending, in other words the first url returned is the first one that
|
62
|
+
# was inserted into the DB.
|
63
|
+
#
|
64
|
+
# @param crawled [Boolean] Filter by Url#crawled value. nil returns all.
|
65
|
+
# @param limit [Integer] The max number of Url's to return. 0 returns all.
|
66
|
+
# @param skip [Integer] Skip n amount of Url's.
|
67
|
+
# @yield [url] Given each Url returned from the DB.
|
68
|
+
# @return [Array<Wgit::Url>] The Urls obtained from the DB.
|
69
|
+
def urls(crawled = nil, limit = 0, skip = 0)
|
70
|
+
crawled.nil? ? query = {} : query = { crawled: crawled }
|
92
71
|
|
93
|
-
sort = { :
|
72
|
+
sort = { date_added: 1 }
|
94
73
|
results = retrieve(:urls, query, sort, {}, limit, skip)
|
95
74
|
return [] if results.count < 1
|
96
75
|
|
97
76
|
# results.respond_to? :map! is false so we use map and overwrite the var.
|
98
77
|
results = results.map { |url_doc| Wgit::Url.new(url_doc) }
|
99
|
-
|
100
|
-
|
78
|
+
results.each { |url| yield(url) } if block_given?
|
79
|
+
|
80
|
+
results
|
101
81
|
end
|
102
82
|
|
83
|
+
# Returns Url records that have been crawled.
|
84
|
+
#
|
85
|
+
# @param limit [Integer] The max number of Url's to return. 0 returns all.
|
86
|
+
# @param skip [Integer] Skip n amount of Url's.
|
87
|
+
# @yield [url] Given each Url returned from the DB.
|
88
|
+
# @return [Array<Wgit::Url>] The crawled Urls obtained from the DB.
|
103
89
|
def crawled_urls(limit = 0, skip = 0, &block)
|
104
90
|
urls(true, limit, skip, &block)
|
105
91
|
end
|
106
|
-
|
92
|
+
|
93
|
+
# Returns Url records that haven't been crawled. Each Url is yielded to a
|
94
|
+
# block, if given.
|
95
|
+
#
|
96
|
+
# @param limit [Integer] The max number of Url's to return. 0 returns all.
|
97
|
+
# @param skip [Integer] Skip n amount of Url's.
|
98
|
+
# @yield [url] Given each Url returned from the DB.
|
99
|
+
# @return [Array<Wgit::Url>] The uncrawled Urls obtained from the DB.
|
107
100
|
def uncrawled_urls(limit = 0, skip = 0, &block)
|
108
101
|
urls(false, limit, skip, &block)
|
109
102
|
end
|
110
103
|
|
104
|
+
# Searches against the indexed docs in the DB for the given query.
|
105
|
+
#
|
111
106
|
# Currently all searches are case insensitive.
|
112
107
|
#
|
113
|
-
#
|
114
|
-
# The searched fields are decided by the text index setup against the
|
108
|
+
# The searched fields are decided by the text index setup against the
|
115
109
|
# documents collection. Currently we search against the following fields:
|
116
110
|
# "author", "keywords", "title" and "text".
|
117
111
|
#
|
118
|
-
# The MongoDB search ranks/sorts the results in order (highest first) based
|
119
|
-
# upon each documents textScore which records the number of
|
120
|
-
# then store this textScore in each Document object for use
|
121
|
-
# needed.
|
112
|
+
# The MongoDB search ranks/sorts the results in order (highest first) based
|
113
|
+
# upon each documents textScore which records the number of query hits. We
|
114
|
+
# then store this textScore in each Document result object for use
|
115
|
+
# elsewhere if needed.
|
122
116
|
#
|
123
|
-
# @param
|
124
|
-
# @param whole_sentence [Boolean]
|
125
|
-
#
|
126
|
-
# @param limit [
|
127
|
-
# @param skip [
|
128
|
-
#
|
129
|
-
# @
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
text.strip!
|
134
|
-
text.replace("\"" + text + "\"") if whole_sentence
|
117
|
+
# @param query [String] The text query to search with.
|
118
|
+
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
119
|
+
# for together as one quoted phrase (true) or separately (false).
|
120
|
+
# @param limit [Integer] The max number of results to return.
|
121
|
+
# @param skip [Integer] The number of DB records to skip.
|
122
|
+
# @yield [doc] Given each search result (Wgit::Document).
|
123
|
+
# @return [Array<Wgit::Document>] The search results obtained from the DB.
|
124
|
+
def search(query, whole_sentence = false, limit = 10, skip = 0)
|
125
|
+
query.strip!
|
126
|
+
query.replace("\"" + query + "\"") if whole_sentence
|
135
127
|
|
136
|
-
# The
|
137
|
-
# We use the
|
138
|
-
# :$caseSensitive => case_sensitive,
|
139
|
-
sort_proj = { :
|
140
|
-
query = { :$text => { :$search =>
|
128
|
+
# The sort_proj sorts based on the most search hits.
|
129
|
+
# We use the sort_proj hash as both a sort and a projection below.
|
130
|
+
# :$caseSensitive => case_sensitive, 3.2+ only.
|
131
|
+
sort_proj = { score: { :$meta => "textScore" } }
|
132
|
+
query = { :$text => { :$search => query } }
|
133
|
+
|
141
134
|
results = retrieve(:documents, query, sort_proj, sort_proj, limit, skip)
|
142
|
-
|
143
|
-
|
135
|
+
return [] if results.count < 1 # respond_to? :empty? == false
|
136
|
+
|
144
137
|
# results.respond_to? :map! is false so we use map and overwrite the var.
|
145
138
|
results = results.map { |mongo_doc| Wgit::Document.new(mongo_doc) }
|
146
|
-
|
147
|
-
|
139
|
+
results.each { |doc| yield(doc) } if block_given?
|
140
|
+
|
141
|
+
results
|
148
142
|
end
|
149
143
|
|
150
|
-
#
|
151
|
-
|
152
|
-
|
153
|
-
results = search(text, whole_sentence, limit, skip, &block)
|
154
|
-
Wgit::Utils.printf_search_results(results, text, false, sentence_length)
|
155
|
-
end
|
156
|
-
|
157
|
-
# Returns a Mongo object which can be used like a Hash to retrieve values.
|
144
|
+
# Returns statistics about the database.
|
145
|
+
#
|
146
|
+
# @return [BSON::Document#[]#fetch] Similar to a Hash instance.
|
158
147
|
def stats
|
159
|
-
|
148
|
+
@@client.command(dbStats: 0).documents[0]
|
160
149
|
end
|
161
150
|
|
151
|
+
# Returns the current size of the database.
|
152
|
+
#
|
153
|
+
# @return [Integer] The current size of the DB.
|
162
154
|
def size
|
163
|
-
|
155
|
+
stats[:dataSize]
|
164
156
|
end
|
165
|
-
|
157
|
+
|
158
|
+
# Returns the total number of URL records in the DB.
|
159
|
+
#
|
160
|
+
# @return [Integer] The current number of URL records.
|
161
|
+
def num_urls
|
162
|
+
@@client[:urls].count
|
163
|
+
end
|
164
|
+
|
165
|
+
# Returns the total number of Document records in the DB.
|
166
|
+
#
|
167
|
+
# @return [Integer] The current number of Document records.
|
168
|
+
def num_docs
|
169
|
+
@@client[:documents].count
|
170
|
+
end
|
171
|
+
|
172
|
+
# Returns the total number of records (urls + docs) in the DB.
|
173
|
+
#
|
174
|
+
# @return [Integer] The current number of URL and Document records.
|
175
|
+
def num_records
|
176
|
+
num_urls + num_docs
|
177
|
+
end
|
178
|
+
|
179
|
+
# Returns whether or not a record with the given url (which is unique)
|
180
|
+
# exists in the database's 'urls' collection.
|
181
|
+
#
|
182
|
+
# @param url [Wgit::Url] The Url to search the DB for.
|
183
|
+
# @return [Boolean] True if url exists, otherwise false.
|
184
|
+
def url?(url)
|
185
|
+
h = { "url" => url }
|
186
|
+
not @@client[:urls].find(h).none?
|
187
|
+
end
|
188
|
+
|
189
|
+
# Returns whether or not a record with the given doc.url (which is unique)
|
190
|
+
# exists in the database's 'documents' collection.
|
191
|
+
#
|
192
|
+
# @param doc [Wgit::Document] The Document to search the DB for.
|
193
|
+
# @return [Boolean] True if doc exists, otherwise false.
|
194
|
+
def doc?(doc)
|
195
|
+
url = doc.respond_to?(:url) ? doc.url : doc
|
196
|
+
h = { "url" => url }
|
197
|
+
not @@client[:documents].find(h).none?
|
198
|
+
end
|
199
|
+
|
166
200
|
### Update Data ###
|
167
201
|
|
202
|
+
# Update a Url or Document object in the DB.
|
203
|
+
#
|
204
|
+
# @param data [Wgit::Url, Wgit::Document] The Url or Document object to
|
205
|
+
# update.
|
206
|
+
# @raise [RuntimeError] If the data is not valid.
|
168
207
|
def update(data)
|
169
208
|
if data.is_a?(Url)
|
170
209
|
update_url(data)
|
@@ -174,96 +213,134 @@ module Wgit
|
|
174
213
|
raise "data is not in the correct format (all Url's or Document's)"
|
175
214
|
end
|
176
215
|
end
|
177
|
-
|
178
|
-
def update_url(url)
|
179
|
-
assert_type(url, Url)
|
180
|
-
selection = { :url => url }
|
181
|
-
url_hash = Wgit::Model.url(url).merge(Wgit::Model.common_update_data)
|
182
|
-
update = { "$set" => url_hash }
|
183
|
-
_update(true, :urls, selection, update)
|
184
|
-
end
|
185
|
-
|
186
|
-
def update_doc(doc)
|
187
|
-
assert_type(doc, Document)
|
188
|
-
selection = { :url => doc.url }
|
189
|
-
doc_hash = Wgit::Model.document(doc).merge(Wgit::Model.common_update_data)
|
190
|
-
update = { "$set" => doc_hash }
|
191
|
-
_update(true, :documents, selection, update)
|
192
|
-
end
|
193
|
-
|
194
|
-
private
|
195
216
|
|
217
|
+
private
|
218
|
+
|
219
|
+
# Returns whether the write to the DB succeeded.
|
196
220
|
def write_succeeded?(result, count = 1, multi = false)
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
result.n == count
|
209
|
-
else
|
210
|
-
result.documents.first[:err].nil?
|
211
|
-
end
|
221
|
+
case result.class.to_s
|
222
|
+
# Single create result.
|
223
|
+
when "Mongo::Operation::Insert::Result"
|
224
|
+
result.documents.first[:err].nil?
|
225
|
+
# Multiple create result.
|
226
|
+
when "Mongo::BulkWrite::Result"
|
227
|
+
result.inserted_count == count
|
228
|
+
# Single and multiple update result.
|
229
|
+
when "Mongo::Operation::Update::Result"
|
230
|
+
if multi
|
231
|
+
result.n == count
|
212
232
|
else
|
213
|
-
|
233
|
+
result.documents.first[:err].nil?
|
214
234
|
end
|
235
|
+
# Class no longer used, have you upgraded the 'mongo' gem?
|
236
|
+
else
|
237
|
+
raise "Result class not currently supported: #{result.class.to_s}"
|
238
|
+
end
|
239
|
+
end
|
240
|
+
|
241
|
+
# Insert one or more Url objects into the DB.
|
242
|
+
def insert_urls(url_or_urls)
|
243
|
+
unless url_or_urls.respond_to?(:map)
|
244
|
+
assert_type(url_or_urls, Url)
|
245
|
+
url_or_urls = Wgit::Model.url(url_or_urls)
|
246
|
+
else
|
247
|
+
assert_arr_types(url_or_urls, Url)
|
248
|
+
url_or_urls = url_or_urls.map do |url|
|
249
|
+
Wgit::Model.url(url)
|
250
|
+
end
|
251
|
+
end
|
252
|
+
create(:urls, url_or_urls)
|
215
253
|
end
|
216
254
|
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
unless write_succeeded?(result)
|
224
|
-
raise "DB write (insert) failed"
|
225
|
-
end
|
226
|
-
result.n
|
227
|
-
# Multiple docs.
|
228
|
-
elsif data.is_a?(Array)
|
229
|
-
assert_arr_types(data, Hash)
|
230
|
-
data.map! do |data_hash|
|
231
|
-
data_hash.merge(Wgit::Model.common_insert_data)
|
232
|
-
end
|
233
|
-
result = @@client[collection.to_sym].insert_many(data)
|
234
|
-
unless write_succeeded?(result, data.length)
|
235
|
-
raise "DB write(s) failed"
|
236
|
-
end
|
237
|
-
result.inserted_count
|
238
|
-
else
|
239
|
-
raise "data must be a Hash or an Array of Hash's"
|
255
|
+
# Insert one or more Document objects into the DB.
|
256
|
+
def insert_docs(doc_or_docs)
|
257
|
+
unless doc_or_docs.respond_to?(:map)
|
258
|
+
assert_type(doc_or_docs, [Document, Hash])
|
259
|
+
unless doc_or_docs.is_a?(Hash)
|
260
|
+
doc_or_docs = Wgit::Model.document(doc_or_docs)
|
240
261
|
end
|
262
|
+
else
|
263
|
+
assert_arr_types(doc_or_docs, [Document, Hash])
|
264
|
+
doc_or_docs = doc_or_docs.map do |doc|
|
265
|
+
Wgit::Model.document(doc) unless doc.is_a?(Hash)
|
266
|
+
end
|
267
|
+
end
|
268
|
+
create(:documents, doc_or_docs)
|
241
269
|
end
|
242
270
|
|
243
|
-
|
271
|
+
# Create/insert one or more Url or Document records into the DB.
|
272
|
+
def create(collection, data)
|
273
|
+
assert_type(data, [Hash, Array])
|
274
|
+
# Single doc.
|
275
|
+
if data.is_a?(Hash)
|
276
|
+
data.merge!(Wgit::Model.common_insert_data)
|
277
|
+
result = @@client[collection.to_sym].insert_one(data)
|
278
|
+
unless write_succeeded?(result)
|
279
|
+
raise "DB write (insert) failed"
|
280
|
+
end
|
281
|
+
result.n
|
282
|
+
# Multiple docs.
|
283
|
+
elsif data.is_a?(Array)
|
284
|
+
assert_arr_types(data, Hash)
|
285
|
+
data.map! do |data_hash|
|
286
|
+
data_hash.merge(Wgit::Model.common_insert_data)
|
287
|
+
end
|
288
|
+
result = @@client[collection.to_sym].insert_many(data)
|
289
|
+
unless write_succeeded?(result, data.length)
|
290
|
+
raise "DB write(s) failed"
|
291
|
+
end
|
292
|
+
result.inserted_count
|
293
|
+
else
|
294
|
+
raise "data must be a Hash or an Array of Hash's"
|
295
|
+
end
|
296
|
+
end
|
297
|
+
|
298
|
+
# Retrieve Url or Document records from the DB.
|
299
|
+
def retrieve(collection, query,
|
300
|
+
sort = {}, projection = {},
|
244
301
|
limit = 0, skip = 0)
|
245
|
-
|
246
|
-
|
247
|
-
|
302
|
+
assert_type(query, Hash)
|
303
|
+
@@client[collection.to_sym].find(query).projection(projection)
|
304
|
+
.skip(skip).limit(limit).sort(sort)
|
305
|
+
end
|
306
|
+
|
307
|
+
# Update a Url object in the DB.
|
308
|
+
def update_url(url)
|
309
|
+
assert_type(url, Url)
|
310
|
+
selection = { url: url }
|
311
|
+
url_hash = Wgit::Model.url(url).merge(Wgit::Model.common_update_data)
|
312
|
+
update = { "$set" => url_hash }
|
313
|
+
_update(true, :urls, selection, update)
|
314
|
+
end
|
315
|
+
|
316
|
+
# Update a Document object in the DB.
|
317
|
+
def update_doc(doc)
|
318
|
+
assert_type(doc, Document)
|
319
|
+
selection = { url: doc.url }
|
320
|
+
doc_hash = Wgit::Model.document(doc).merge(Wgit::Model.common_update_data)
|
321
|
+
update = { "$set" => doc_hash }
|
322
|
+
_update(true, :documents, selection, update)
|
248
323
|
end
|
249
324
|
|
325
|
+
# Update one or more Url or Document records in the DB.
|
250
326
|
# NOTE: The Model.common_update_data should be merged in the calling
|
251
327
|
# method as the update param can be bespoke due to its nature.
|
252
328
|
def _update(single, collection, selection, update)
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
329
|
+
assert_arr_types([selection, update], Hash)
|
330
|
+
if single
|
331
|
+
result = @@client[collection.to_sym].update_one(selection, update)
|
332
|
+
else
|
333
|
+
result = @@client[collection.to_sym].update_many(selection, update)
|
334
|
+
end
|
335
|
+
raise "DB write (update) failed" unless write_succeeded?(result)
|
336
|
+
result.n
|
261
337
|
end
|
262
338
|
|
263
339
|
alias :count :size
|
264
340
|
alias :length :size
|
341
|
+
alias :num_documents :num_docs
|
342
|
+
alias :document? :doc?
|
265
343
|
alias :insert_url :insert_urls
|
266
344
|
alias :insert_doc :insert_docs
|
267
|
-
alias :search_and_format :search_p
|
268
345
|
end
|
269
346
|
end
|