wgit 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 82f33e00a273c6cdeb3ba9c171110d849fff2428
4
- data.tar.gz: 14c63f826d1d21811b14e9f3a2bca750b3f4afa3
2
+ SHA256:
3
+ metadata.gz: c2ee83f5b722a6aff6fad7727751e149b58de02de3abecd49e51066a1ecf035d
4
+ data.tar.gz: 112fb3b45192e781d8a4b4eeaca6d3ad7abaaa1a29b8833654b17a9b38644f81
5
5
  SHA512:
6
- metadata.gz: 7c42b925f72d9e7cceba79d9aee764f97b6537c0005038501a1f75c36b1bcd3b6036cfb9b62fcf01fd435e0348c1e8c00c445a291051c068fa58184de2c9590a
7
- data.tar.gz: a2a756c3be7b9b214921bfdac5846a2250e452265285cb9c3b812d2eaefc2ab969b608cd1841f34507a6ef184f20ba7c98658daf0135fb85eead88de0356320f
6
+ metadata.gz: a29f32db3538bed0e2b09ae599623ee9751ed0f929f83b1d7a987982132431cc72b89fa5bc3e2522c50c50c4a7d0fd33aaf0fbc000986b1db8f0494a37cebd7c
7
+ data.tar.gz: faf0f814dad58fef4a4ec61a5cdd3f4b3ba7170b417e48a18f4a45922efa0dcab8ffaf4bbe0d198d53e5778056599e6827d1315ce68e08984ca2af47e26631bc
@@ -1,6 +1,6 @@
1
1
  require_relative 'wgit/version'
2
2
  require_relative 'wgit/crawler'
3
- require_relative 'wgit/web_crawler'
3
+ require_relative 'wgit/indexer'
4
4
  require_relative 'wgit/url'
5
5
  require_relative 'wgit/document'
6
6
  require_relative 'wgit/utils'
@@ -1,69 +1,80 @@
1
-
2
1
  module Wgit
3
2
 
4
- # @author Michael Telford
5
- # Module containing assert methods including type checking which can be used
6
- # for asserting the integrity of method definitions etc.
3
+ # Module containing assert methods including type checking which can be used
4
+ # for asserting the integrity of method definitions etc.
7
5
  module Assertable
8
- DEFAULT_TYPE_FAIL_MSG = "Expected: %s, Actual: %s"
9
- WRONG_METHOD_MSG = "arr must be Enumerable, use a different method"
10
- DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s"
11
-
12
- # obj.instance_of? must return true for one of the types listed in
13
- # type_or_types or an exception is thrown using msg if provided.
14
- # type_or_types can be a single Class or an Enumerable of Class objects,
15
- # Strings and Symbols will not work.
16
- def assert_types(obj, type_or_types, msg = nil)
17
- msg ||= DEFAULT_TYPE_FAIL_MSG % [type_or_types, obj.class]
18
- if type_or_types.respond_to?(:any?)
19
- match = type_or_types.any? { |type| obj.instance_of?(type) }
20
- else
21
- match = obj.instance_of?(type_or_types)
22
- end
23
- raise msg unless match
24
- obj
25
- end
26
-
27
- # Each object within arr must match one of the types listed in
28
- # type_or_types or an exception is thrown using msg if provided.
29
- # type_or_types can be a single Class or an Enumerable of Class objects,
30
- # Strings and Symbols will not work.
31
- def assert_arr_types(arr, type_or_types, msg = nil)
32
- raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
33
- arr.each do |obj|
34
- assert_types(obj, type_or_types, msg)
35
- end
6
+ DEFAULT_TYPE_FAIL_MSG = "Expected: %s, Actual: %s".freeze
7
+ WRONG_METHOD_MSG = "arr must be Enumerable, use a different method".freeze
8
+ DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s".freeze
9
+
10
+ # Tests if the obj is of a given type.
11
+ #
12
+ # @param obj [Object] The Object to test.
13
+ # @param type_or_types [Type, Array<Type>] The type/types that obj must
14
+ # belong to or an exception is thrown.
15
+ # @param msg [String] The raised RuntimeError message, if provided.
16
+ # @return [Object] The given obj on successful assertion.
17
+ def assert_types(obj, type_or_types, msg = nil)
18
+ msg ||= DEFAULT_TYPE_FAIL_MSG % [type_or_types, obj.class]
19
+ if type_or_types.respond_to?(:any?)
20
+ match = type_or_types.any? { |type| obj.instance_of?(type) }
21
+ else
22
+ match = obj.instance_of?(type_or_types)
36
23
  end
37
-
38
- # The obj_or_objs must respond_to? all of the given methods or an
39
- # Exception is raised using msg or a default message.
40
- # Returns obj_or_objs on sucessful assertion.
41
- def assert_respond_to(obj_or_objs, methods, msg = nil)
42
- if obj_or_objs.respond_to?(:each)
43
- obj_or_objs.each do |obj|
44
- _assert_respond_to(obj, methods, msg)
45
- end
46
- else
47
- _assert_respond_to(obj_or_objs, methods, msg)
48
- end
49
- obj_or_objs
24
+ raise msg unless match
25
+ obj
26
+ end
27
+
28
+ # Each object within arr must match one of the types listed in
29
+ # type_or_types or an exception is raised using msg, if provided.
30
+ #
31
+ # @param arr [Enumerable#each] Enumerable of objects to type check.
32
+ # @param type_or_types [Type, Array<Type>] The allowed type(s).
33
+ # @param msg [String] The raised RuntimeError message, if provided.
34
+ # @return [Object] The given arr on successful assertion.
35
+ def assert_arr_types(arr, type_or_types, msg = nil)
36
+ raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
37
+ arr.each do |obj|
38
+ assert_types(obj, type_or_types, msg)
50
39
  end
51
-
52
- private
53
-
54
- def _assert_respond_to(obj, methods, msg = nil)
55
- msg ||= DEFAULT_DUCK_FAIL_MSG % ["#{obj.class} (#{obj})", methods]
56
- match = methods.all? { |method| obj.respond_to?(method) }
57
- raise msg unless match
58
- obj
40
+ end
41
+
42
+ # The obj_or_objs must respond_to? all of the given methods or an
43
+ # Exception is raised using msg, if provided.
44
+ #
45
+ # @param obj_or_objs [Object, Enumerable#each] The objects to duck check.
46
+ # @param methods [Array<Symbol>] The methods to :respond_to?.
47
+ # @param msg [String] The raised RuntimeError message, if provided.
48
+ # @return [Object] The given obj_or_objs on successful assertion.
49
+ def assert_respond_to(obj_or_objs, methods, msg = nil)
50
+ methods = [methods] unless methods.respond_to?(:all?)
51
+ if obj_or_objs.respond_to?(:each)
52
+ obj_or_objs.each do |obj|
53
+ _assert_respond_to(obj, methods, msg)
54
+ end
55
+ else
56
+ _assert_respond_to(obj_or_objs, methods, msg)
59
57
  end
60
-
61
- alias :assert_type :assert_types
62
- alias :type :assert_types
63
- alias :types :assert_types
64
- alias :assert_arr_type :assert_arr_types
65
- alias :arr_type :assert_arr_types
66
- alias :arr_types :assert_arr_types
67
- alias :respond_to :assert_respond_to
58
+ obj_or_objs
59
+ end
60
+
61
+ private
62
+
63
+ # obj must respond_to? all methods or an exception is raised.
64
+ def _assert_respond_to(obj, methods, msg = nil)
65
+ raise "methods must respond_to? :all?" unless methods.respond_to?(:all?)
66
+ msg ||= DEFAULT_DUCK_FAIL_MSG % ["#{obj.class} (#{obj})", methods]
67
+ match = methods.all? { |method| obj.respond_to?(method) }
68
+ raise msg unless match
69
+ obj
70
+ end
71
+
72
+ alias :assert_type :assert_types
73
+ alias :type :assert_types
74
+ alias :types :assert_types
75
+ alias :assert_arr_type :assert_arr_types
76
+ alias :arr_type :assert_arr_types
77
+ alias :arr_types :assert_arr_types
78
+ alias :respond_to :assert_respond_to
68
79
  end
69
80
  end
@@ -1,11 +1,12 @@
1
1
  require_relative 'url'
2
2
 
3
- # @author Michael Telford
4
3
  # Script which extends Ruby's core functionality when parsed.
5
- # Needs to be required separately using `require 'wgit/core_ext'`.
4
+ # Needs to be required separately using `require 'wgit/core_ext'`.
6
5
 
7
6
  class String
8
- # Converts a String into a Wgit::Url object.
7
+ # Converts a String into a Wgit::Url object.
8
+ #
9
+ # @return [Wgit::Url] The converted URL.
9
10
  def to_url
10
11
  Wgit::Url.new(self)
11
12
  end
@@ -13,7 +14,9 @@ end
13
14
 
14
15
  module Enumerable
15
16
  # Converts each String instance into a Wgit::Url object and returns the new
16
- # array.
17
+ # Array.
18
+ #
19
+ # @return [Array<Wgit::Url>] The converted URL's.
17
20
  def to_urls
18
21
  map do |element|
19
22
  process_url_element(element)
@@ -21,7 +24,9 @@ module Enumerable
21
24
  end
22
25
 
23
26
  # Converts each String instance into a Wgit::Url object and returns the
24
- # updated array.
27
+ # updated array. Modifies the receiver.
28
+ #
29
+ # @return [Array<Wgit::Url>] Self containing the converted URL's.
25
30
  def to_urls!
26
31
  map! do |element|
27
32
  process_url_element(element)
@@ -31,6 +36,7 @@ end
31
36
 
32
37
  private
33
38
 
39
+ # Converts the element to a Wgit::Url if the element is a String.
34
40
  def process_url_element(element)
35
41
  if element.is_a? String
36
42
  element.to_url
@@ -3,67 +3,106 @@ require_relative 'document'
3
3
  require_relative 'utils'
4
4
  require_relative 'assertable'
5
5
  require 'net/http' # requires 'uri'
6
-
6
+
7
7
  module Wgit
8
8
 
9
- # @author Michael Telford
10
- # Crawler class provides a means of crawling web URL's.
11
- # Note that any redirects will not be followed for during crawling
12
- # functionality.
9
+ # The Crawler class provides a means of crawling web based URL's, turning
10
+ # their HTML into Wgit::Document's.
11
+ # Note that redirects are currently not followed during a crawl.
13
12
  class Crawler
14
13
  include Assertable
15
14
 
16
- attr_reader :urls, :docs
15
+ # The urls to crawl.
16
+ attr_reader :urls
17
+
18
+ # The docs of the crawled @urls.
19
+ attr_reader :docs
17
20
 
18
- def initialize(*urls)
19
- self.urls = urls unless urls.nil?
21
+ # Initializes the Crawler by setting the @urls and @docs.
22
+ #
23
+ # @param urls [*Wgit::Url] The URLs to crawl.
24
+ def initialize(*urls)
25
+ self.[](*urls)
20
26
  @docs = []
21
- end
22
-
27
+ end
28
+
29
+ # Sets this Crawler's @urls.
30
+ #
31
+ # @param urls [Array<Wgit::Url>] The URLs to crawl.
23
32
  def urls=(urls)
24
- @urls = []
25
- Wgit::Utils.each(urls) { |url| add_url(url) }
33
+ @urls = []
34
+ Wgit::Utils.each(urls) { |url| add_url(url) }
26
35
  end
27
36
 
37
+ # Sets this Crawler's @urls.
38
+ #
39
+ # @param urls [*Wgit::Url] The URLs to crawl.
28
40
  def [](*urls)
29
- self.urls = urls unless urls.nil?
41
+ # If urls is nil then add_url (when called later) will set @urls = []
42
+ # so we do nothing here.
43
+ if not urls.nil?
44
+ # Due to *urls you can end up with [[url1,url2,url3]] etc. where the
45
+ # outer array is bogus so we use the inner one only.
46
+ if urls.is_a?(Enumerable) &&
47
+ urls.length == 1 &&
48
+ urls.first.is_a?(Enumerable)
49
+ urls = urls.first
50
+ end
51
+
52
+ # Here we call urls= method using self because the param name is also
53
+ # urls which conflicts.
54
+ self.urls = urls
55
+ end
30
56
  end
31
57
 
58
+ # Adds the url to this Crawler's @urls.
59
+ #
60
+ # @param url [Wgit::Url] A URL to crawl.
32
61
  def <<(url)
33
- add_url(url)
62
+ add_url(url)
34
63
  end
35
-
64
+
36
65
  # Crawls individual urls, not entire sites.
37
- # Returns the last crawled doc.
38
- # Yields each doc to the provided block or adds each doc to @docs
39
- # which can be accessed by Crawler#docs after the method returns.
40
- def crawl_urls(urls = @urls, &block)
66
+ #
67
+ # @param urls [Array<Wgit::Url>] The URLs to crawl.
68
+ # @yield [doc] If provided, the block is given each crawled
69
+ # Document. Otherwise each doc is added to @docs which can be accessed
70
+ # by Crawler#docs after this method returns.
71
+ # @return [Wgit::Document] The last Document crawled.
72
+ def crawl_urls(urls = @urls, &block)
41
73
  raise "No urls to crawl" unless urls
42
74
  @docs = []
43
75
  doc = nil
44
76
  Wgit::Utils.each(urls) { |url| doc = handle_crawl_block(url, &block) }
45
77
  doc ? doc : @docs.last
46
- end
47
-
48
- # Crawl the url and return the response document or nil.
49
- # Also yield(doc) if a block is provided. The doc is passed to the block
50
- # regardless of the crawl success so the doc.url can be used if needed.
51
- def crawl_url(url = @urls.first, &block)
52
- assert_type(url, Url)
53
- markup = fetch(url)
78
+ end
79
+
80
+ # Crawl the url and return the response document or nil.
81
+ #
82
+ # @param url [Wgit::Url] The URL to crawl.
83
+ # @yield [doc] The crawled HTML Document, regardless of whether the
84
+ # crawl was successful or not. Therefore, the Document#url can be used.
85
+ # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
86
+ # crawl was unsuccessful.
87
+ def crawl_url(url = @urls.first)
88
+ assert_type(url, Wgit::Url)
89
+ markup = fetch(url)
54
90
  url.crawled = true
55
91
  doc = Wgit::Document.new(url, markup)
56
- block.call(doc) if block_given?
92
+ yield(doc) if block_given?
57
93
  doc.empty? ? nil : doc
58
- end
59
-
94
+ end
95
+
60
96
  # Crawls an entire site by recursively going through its internal_links.
61
- # Also yield(doc) for each crawled doc if a block is provided.
62
- # A block is the only way to interact with the crawled docs.
63
- # Returns a unique array of external urls collected from the site
64
- # or nil if the base_url could not be crawled successfully.
97
+ #
98
+ # @param base_url [Wgit::Url] The base URL of the website to be crawled.
99
+ # @yield [doc] Given each crawled Document/page of the site.
100
+ # A block is the only way to interact with each crawled Document.
101
+ # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
102
+ # from all of the site's pages or nil if the base_url could not be
103
+ # crawled successfully.
65
104
  def crawl_site(base_url = @urls.first, &block)
66
- assert_type(base_url, Url)
105
+ assert_type(base_url, Wgit::Url)
67
106
 
68
107
  doc = crawl_url(base_url, &block)
69
108
  return nil if doc.nil?
@@ -75,7 +114,7 @@ module Wgit
75
114
  return doc.external_links.uniq if internal_urls.empty?
76
115
 
77
116
  loop do
78
- internal_urls.uniq! unless internal_urls.uniq.nil?
117
+ internal_urls.uniq!
79
118
 
80
119
  links = internal_urls - crawled_urls
81
120
  break if links.empty?
@@ -94,36 +133,37 @@ module Wgit
94
133
 
95
134
  private
96
135
 
97
- # Add the document to the @docs array for later processing
98
- # or let the block process it here and now.
136
+ # Add the document to the @docs array for later processing or let the block
137
+ # process it here and now.
99
138
  def handle_crawl_block(url, &block)
100
- if not block_given?
101
- @docs << crawl_url(url)
102
- nil
103
- else
104
- crawl_url(url, &block)
105
- end
139
+ if block_given?
140
+ crawl_url(url, &block)
141
+ else
142
+ @docs << crawl_url(url)
143
+ nil
144
+ end
106
145
  end
107
146
 
108
147
  # The fetch method performs a HTTP GET to obtain the HTML document.
109
- # Invalid urls or any HTTP response that doesn't return a HTML body
110
- # will be ignored and nil will be returned. This means that redirects
111
- # etc. will not be followed.
148
+ # Invalid urls or any HTTP response that doesn't return a HTML body will be
149
+ # ignored and nil will be returned. This means that redirects etc. will
150
+ # not be followed.
112
151
  def fetch(url)
113
- raise unless url.respond_to?(:to_uri)
114
- res = Net::HTTP.get_response(url.to_uri)
115
- res.body.empty? ? nil : res.body
152
+ raise unless url.respond_to?(:to_uri)
153
+ res = Net::HTTP.get_response(url.to_uri)
154
+ res.body.empty? ? nil : res.body
116
155
  rescue
117
- nil
156
+ nil
118
157
  end
119
158
 
159
+ # Add the url to @urls ensuring it is cast to a Wgit::Url if necessary.
120
160
  def add_url(url)
121
- @urls = [] if @urls.nil?
122
- if url.instance_of?(Url)
123
- @urls << url
124
- else
125
- @urls << Wgit::Url.new(url)
126
- end
161
+ @urls = [] if @urls.nil?
162
+ if url.is_a?(Wgit::Url)
163
+ @urls << url
164
+ else
165
+ @urls << Wgit::Url.new(url)
166
+ end
127
167
  end
128
168
 
129
169
  alias :crawl :crawl_urls
@@ -2,22 +2,19 @@ require_relative '../document'
2
2
  require_relative '../url'
3
3
  require_relative '../utils'
4
4
  require_relative '../assertable'
5
- require_relative 'mongo_connection_details'
6
5
  require_relative 'model'
7
6
  require 'mongo'
8
7
 
9
8
  module Wgit
10
9
 
11
- # @author Michael Telford
12
10
  # Class modeling a DB connection and CRUD operations for the Url and
13
11
  # Document collections.
14
- # The most common methods are: insert, update, urls, search, stats, size.
15
12
  class Database
16
13
  include Assertable
17
-
18
- # Is relative to the root project folder, not this file.
19
- LOG_FILE_PATH = "misc/mongo_log.txt"
20
-
14
+
15
+ # Initializes a database connection client.
16
+ #
17
+ # @raise [RuntimeError] If Wgit::CONNECTION_DETAILS aren't set.
21
18
  def initialize
22
19
  conn_details = Wgit::CONNECTION_DETAILS
23
20
  if conn_details.empty?
@@ -25,146 +22,188 @@ module Wgit
25
22
  :port, :db, :uname, :pword for a database connection to be established."
26
23
  end
27
24
 
28
- logger = Logger.new(LOG_FILE_PATH)
25
+ # Only log to STDOUT in fatal scenarios.
26
+ Mongo::Logger.logger.level = Logger::FATAL
27
+
29
28
  address = "#{conn_details[:host]}:#{conn_details[:port]}"
30
29
  @@client = Mongo::Client.new([address],
31
- :database => conn_details[:db],
32
- :user => conn_details[:uname],
33
- :password => conn_details[:pword],
34
- :logger => logger,
35
- :truncate_logs => false)
30
+ database: conn_details[:db],
31
+ user: conn_details[:uname],
32
+ password: conn_details[:pword])
36
33
  end
37
34
 
38
35
  ### Create Data ###
39
36
 
37
+ # Insert one or more Url or Document objects into the DB.
38
+ #
39
+ # @param data [Wgit::Url, Wgit::Document, Enumerable] The Url(s) or
40
+ # Document(s) to insert; each is converted via Wgit::Model before saving.
41
+ # @raise [RuntimeError] If the data is not valid.
40
42
  def insert(data)
41
- if data.is_a?(Url)
42
- insert_urls(data)
43
- elsif data.is_a?(Document)
44
- insert_docs(data)
45
- elsif data.respond_to?(:first)
46
- if data.first.is_a?(Url)
47
- insert_urls(data)
48
- else
49
- insert_docs(data)
50
- end
51
- else
52
- raise "data is not in the correct format (all Url's or Document's)"
53
- end
54
- end
55
-
56
- def insert_urls(url_or_urls)
57
- unless url_or_urls.respond_to?(:map)
58
- assert_type(url_or_urls, Url)
59
- url_or_urls = Wgit::Model.url(url_or_urls)
60
- else
61
- assert_arr_types(url_or_urls, Url)
62
- url_or_urls = url_or_urls.map do |url|
63
- Wgit::Model.url(url)
64
- end
65
- end
66
- create(:urls, url_or_urls)
67
- end
68
-
69
- def insert_docs(doc_or_docs)
70
- unless doc_or_docs.respond_to?(:map)
71
- assert_type(doc_or_docs, [Document, Hash])
72
- unless doc_or_docs.is_a?(Hash)
73
- doc_or_docs = Wgit::Model.document(doc_or_docs)
74
- end
43
+ if data.is_a?(Url)
44
+ insert_urls(data)
45
+ elsif data.is_a?(Document)
46
+ insert_docs(data)
47
+ elsif data.respond_to?(:first)
48
+ if data.first.is_a?(Url)
49
+ insert_urls(data)
75
50
  else
76
- assert_arr_types(doc_or_docs, [Document, Hash])
77
- doc_or_docs = doc_or_docs.map do |doc|
78
- Wgit::Model.document(doc) unless doc.is_a?(Hash)
79
- end
51
+ insert_docs(data)
80
52
  end
81
- create(:documents, doc_or_docs)
53
+ else
54
+ raise "data is not in the correct format (all Url's or Document's)"
55
+ end
82
56
  end
83
57
 
84
58
  ### Retrieve Data ###
85
59
 
86
- # A crawled parameter value of nil (the default) returns all urls.
87
- # A limit of 0 means all urls are returned.
88
- # All urls are sorted by date_added ascending, in other words the first
89
- # url in the results is the first added.
90
- def urls(crawled = nil, limit = 0, skip = 0, &block)
91
- crawled.nil? ? query = {} : query = { :crawled => crawled }
60
+ # Returns Url records from the DB. All Urls are sorted by date_added
61
+ # ascending, in other words the first url returned is the first one that
62
+ # was inserted into the DB.
63
+ #
64
+ # @param crawled [Boolean] Filter by Url#crawled value. nil returns all.
65
+ # @param limit [Integer] The max number of Url's to return. 0 returns all.
66
+ # @param skip [Integer] Skip n amount of Url's.
67
+ # @yield [url] Given each Url returned from the DB.
68
+ # @return [Array<Wgit::Url>] The Urls obtained from the DB.
69
+ def urls(crawled = nil, limit = 0, skip = 0)
70
+ crawled.nil? ? query = {} : query = { crawled: crawled }
92
71
 
93
- sort = { :date_added => 1 }
72
+ sort = { date_added: 1 }
94
73
  results = retrieve(:urls, query, sort, {}, limit, skip)
95
74
  return [] if results.count < 1
96
75
 
97
76
  # results.respond_to? :map! is false so we use map and overwrite the var.
98
77
  results = results.map { |url_doc| Wgit::Url.new(url_doc) }
99
- return results unless block_given?
100
- results.each { |url| block.call(url) }
78
+ results.each { |url| yield(url) } if block_given?
79
+
80
+ results
101
81
  end
102
82
 
83
+ # Returns Url records that have been crawled.
84
+ #
85
+ # @param limit [Integer] The max number of Url's to return. 0 returns all.
86
+ # @param skip [Integer] Skip n amount of Url's.
87
+ # @yield [url] Given each Url returned from the DB.
88
+ # @return [Array<Wgit::Url>] The crawled Urls obtained from the DB.
103
89
  def crawled_urls(limit = 0, skip = 0, &block)
104
90
  urls(true, limit, skip, &block)
105
91
  end
106
-
92
+
93
+ # Returns Url records that haven't been crawled. Each Url is yielded to a
94
+ # block, if given.
95
+ #
96
+ # @param limit [Integer] The max number of Url's to return. 0 returns all.
97
+ # @param skip [Integer] Skip n amount of Url's.
98
+ # @yield [url] Given each Url returned from the DB.
99
+ # @return [Array<Wgit::Url>] The uncrawled Urls obtained from the DB.
107
100
  def uncrawled_urls(limit = 0, skip = 0, &block)
108
101
  urls(false, limit, skip, &block)
109
102
  end
110
103
 
104
+ # Searches against the indexed docs in the DB for the given query.
105
+ #
111
106
  # Currently all searches are case insensitive.
112
107
  #
113
- # Searches against the indexed docs in the DB for the given text.
114
- # The searched fields are decided by the text index setup against the
108
+ # The searched fields are decided by the text index setup against the
115
109
  # documents collection. Currently we search against the following fields:
116
110
  # "author", "keywords", "title" and "text".
117
111
  #
118
- # The MongoDB search ranks/sorts the results in order (highest first) based
119
- # upon each documents textScore which records the number of text hits. We
120
- # then store this textScore in each Document object for use elsewhere if
121
- # needed.
112
+ # The MongoDB search ranks/sorts the results in order (highest first) based
113
+ # upon each document's textScore which records the number of query hits. We
114
+ # then store this textScore in each Document result object for use
115
+ # elsewhere if needed.
122
116
  #
123
- # @param text [String] the value to search the data against.
124
- # @param whole_sentence [Boolean] whether multiple words should be
125
- # searched for separately.
126
- # @param limit [Fixnum] the max length/count of the results array.
127
- # @param skip [Fixnum] the number of results to skip, starting with the
128
- # most relevant based upon the textScore of the search.
129
- # @param block [Block] a block which if provided is passed to each result.
130
- #
131
- # @return [Array] of Document objects representing the search results.
132
- def search(text, whole_sentence = false, limit = 10, skip = 0, &block)
133
- text.strip!
134
- text.replace("\"" + text + "\"") if whole_sentence
117
+ # @param query [String] The text query to search with.
118
+ # @param whole_sentence [Boolean] Whether multiple words should be searched
119
+ # for together as a quoted phrase, rather than separately.
120
+ # @param limit [Integer] The max number of results to return.
121
+ # @param skip [Integer] The number of DB records to skip.
122
+ # @yield [doc] Given each search result (Wgit::Document).
123
+ # @return [Array<Wgit::Document>] The search results obtained from the DB.
124
+ def search(query, whole_sentence = false, limit = 10, skip = 0)
125
+ query.strip!
126
+ query.replace("\"" + query + "\"") if whole_sentence
135
127
 
136
- # The textScore sorts based on the most search hits.
137
- # We use the textScore hash as a sort and a projection below.
138
- # :$caseSensitive => case_sensitive, # 3.2+ only.
139
- sort_proj = { :score => { :$meta => "textScore" } }
140
- query = { :$text => { :$search => text } }
128
+ # The sort_proj sorts based on the most search hits.
129
+ # We use the sort_proj hash as both a sort and a projection below.
130
+ # :$caseSensitive => case_sensitive, 3.2+ only.
131
+ sort_proj = { score: { :$meta => "textScore" } }
132
+ query = { :$text => { :$search => query } }
133
+
141
134
  results = retrieve(:documents, query, sort_proj, sort_proj, limit, skip)
142
-
143
- return [] if results.count < 1
135
+ return [] if results.count < 1 # respond_to? :empty? == false
136
+
144
137
  # results.respond_to? :map! is false so we use map and overwrite the var.
145
138
  results = results.map { |mongo_doc| Wgit::Document.new(mongo_doc) }
146
- return results unless block_given?
147
- results.each { |doc| block.call(doc) }
139
+ results.each { |doc| yield(doc) } if block_given?
140
+
141
+ results
148
142
  end
149
143
 
150
- # Performs a search and pretty prints the results.
151
- def search_p(text, whole_sentence = false, limit = 10,
152
- skip = 0, sentence_length = 80, &block)
153
- results = search(text, whole_sentence, limit, skip, &block)
154
- Wgit::Utils.printf_search_results(results, text, false, sentence_length)
155
- end
156
-
157
- # Returns a Mongo object which can be used like a Hash to retrieve values.
144
+ # Returns statistics about the database.
145
+ #
146
+ # @return [BSON::Document#[]#fetch] Similar to a Hash instance.
158
147
  def stats
159
- @@client.command(:dbStats => 0).documents[0]
148
+ @@client.command(dbStats: 0).documents[0]
160
149
  end
161
150
 
151
+ # Returns the current size of the database.
152
+ #
153
+ # @return [Integer] The current size of the DB.
162
154
  def size
163
- stats[:dataSize]
155
+ stats[:dataSize]
164
156
  end
165
-
157
+
158
+ # Returns the total number of URL records in the DB.
159
+ #
160
+ # @return [Integer] The current number of URL records.
161
+ def num_urls
162
+ @@client[:urls].count
163
+ end
164
+
165
+ # Returns the total number of Document records in the DB.
166
+ #
167
+ # @return [Integer] The current number of Document records.
168
+ def num_docs
169
+ @@client[:documents].count
170
+ end
171
+
172
+ # Returns the total number of records (urls + docs) in the DB.
173
+ #
174
+ # @return [Integer] The current number of URL and Document records.
175
+ def num_records
176
+ num_urls + num_docs
177
+ end
178
+
179
+ # Returns whether or not a record with the given url (which is unique)
180
+ # exists in the database's 'urls' collection.
181
+ #
182
+ # @param url [Wgit::Url] The Url to search the DB for.
183
+ # @return [Boolean] True if url exists, otherwise false.
184
+ def url?(url)
185
+ h = { "url" => url }
186
+ not @@client[:urls].find(h).none?
187
+ end
188
+
189
+ # Returns whether or not a record with the given doc.url (which is unique)
190
+ # exists in the database's 'documents' collection.
191
+ #
192
+ # @param doc [Wgit::Document] The Document to search the DB for.
193
+ # @return [Boolean] True if doc exists, otherwise false.
194
+ def doc?(doc)
195
+ url = doc.respond_to?(:url) ? doc.url : doc
196
+ h = { "url" => url }
197
+ not @@client[:documents].find(h).none?
198
+ end
199
+
166
200
  ### Update Data ###
167
201
 
202
+ # Update a Url or Document object in the DB.
203
+ #
204
+ # @param data [Wgit::Url, Wgit::Document] The Url or Document to update;
205
+ # it is converted via Wgit::Model.url or Wgit::Model.document before saving.
206
+ # @raise [RuntimeError] If the data is not valid.
168
207
  def update(data)
169
208
  if data.is_a?(Url)
170
209
  update_url(data)
@@ -174,96 +213,134 @@ module Wgit
174
213
  raise "data is not in the correct format (all Url's or Document's)"
175
214
  end
176
215
  end
177
-
178
- def update_url(url)
179
- assert_type(url, Url)
180
- selection = { :url => url }
181
- url_hash = Wgit::Model.url(url).merge(Wgit::Model.common_update_data)
182
- update = { "$set" => url_hash }
183
- _update(true, :urls, selection, update)
184
- end
185
-
186
- def update_doc(doc)
187
- assert_type(doc, Document)
188
- selection = { :url => doc.url }
189
- doc_hash = Wgit::Model.document(doc).merge(Wgit::Model.common_update_data)
190
- update = { "$set" => doc_hash }
191
- _update(true, :documents, selection, update)
192
- end
193
-
194
- private
195
216
 
217
+ private
218
+
219
+ # Return if the write to the DB succeeded or not.
196
220
  def write_succeeded?(result, count = 1, multi = false)
197
- case result.class.to_s
198
- # Single create result.
199
- when "Mongo::Operation::Write::Insert::Result"
200
- result.documents.first[:err].nil?
201
- # Multiple create result.
202
- when "Mongo::BulkWrite::Result"
203
- result.inserted_count == count
204
- # Single and multiple update result.
205
- when "Mongo::Operation::Write::Update::Result", # MongoDB 3.0
206
- "Mongo::Operation::Write::Update::LegacyResult" # MongoDB 2.4
207
- if multi
208
- result.n == count
209
- else
210
- result.documents.first[:err].nil?
211
- end
221
+ case result.class.to_s
222
+ # Single create result.
223
+ when "Mongo::Operation::Insert::Result"
224
+ result.documents.first[:err].nil?
225
+ # Multiple create result.
226
+ when "Mongo::BulkWrite::Result"
227
+ result.inserted_count == count
228
+ # Single and multiple update result.
229
+ when "Mongo::Operation::Update::Result"
230
+ if multi
231
+ result.n == count
212
232
  else
213
- raise "Result class not currently supported: #{result.class.to_s}"
233
+ result.documents.first[:err].nil?
214
234
  end
235
+ # Class no longer used, have you upgraded the 'mongo' gem?
236
+ else
237
+ raise "Result class not currently supported: #{result.class.to_s}"
238
+ end
239
+ end
240
+
241
+ # Insert one or more Url objects into the DB.
242
+ def insert_urls(url_or_urls)
243
+ unless url_or_urls.respond_to?(:map)
244
+ assert_type(url_or_urls, Url)
245
+ url_or_urls = Wgit::Model.url(url_or_urls)
246
+ else
247
+ assert_arr_types(url_or_urls, Url)
248
+ url_or_urls = url_or_urls.map do |url|
249
+ Wgit::Model.url(url)
250
+ end
251
+ end
252
+ create(:urls, url_or_urls)
215
253
  end
216
254
 
217
- def create(collection, data)
218
- assert_type(data, [Hash, Array])
219
- # Single doc.
220
- if data.is_a?(Hash)
221
- data.merge!(Wgit::Model.common_insert_data)
222
- result = @@client[collection.to_sym].insert_one(data)
223
- unless write_succeeded?(result)
224
- raise "DB write (insert) failed"
225
- end
226
- result.n
227
- # Multiple docs.
228
- elsif data.is_a?(Array)
229
- assert_arr_types(data, Hash)
230
- data.map! do |data_hash|
231
- data_hash.merge(Wgit::Model.common_insert_data)
232
- end
233
- result = @@client[collection.to_sym].insert_many(data)
234
- unless write_succeeded?(result, data.length)
235
- raise "DB write(s) failed"
236
- end
237
- result.inserted_count
238
- else
239
- raise "data must be a Hash or an Array of Hash's"
255
+ # Insert one or more Document objects into the DB.
256
+ def insert_docs(doc_or_docs)
257
+ unless doc_or_docs.respond_to?(:map)
258
+ assert_type(doc_or_docs, [Document, Hash])
259
+ unless doc_or_docs.is_a?(Hash)
260
+ doc_or_docs = Wgit::Model.document(doc_or_docs)
240
261
  end
262
+ else
263
+ assert_arr_types(doc_or_docs, [Document, Hash])
264
+ doc_or_docs = doc_or_docs.map do |doc|
265
+ Wgit::Model.document(doc) unless doc.is_a?(Hash)
266
+ end
267
+ end
268
+ create(:documents, doc_or_docs)
241
269
  end
242
270
 
243
- def retrieve(collection, query, sort = {}, projection = {},
271
+ # Create/insert one or more Url or Document records into the DB.
272
+ def create(collection, data)
273
+ assert_type(data, [Hash, Array])
274
+ # Single doc.
275
+ if data.is_a?(Hash)
276
+ data.merge!(Wgit::Model.common_insert_data)
277
+ result = @@client[collection.to_sym].insert_one(data)
278
+ unless write_succeeded?(result)
279
+ raise "DB write (insert) failed"
280
+ end
281
+ result.n
282
+ # Multiple docs.
283
+ elsif data.is_a?(Array)
284
+ assert_arr_types(data, Hash)
285
+ data.map! do |data_hash|
286
+ data_hash.merge(Wgit::Model.common_insert_data)
287
+ end
288
+ result = @@client[collection.to_sym].insert_many(data)
289
+ unless write_succeeded?(result, data.length)
290
+ raise "DB write(s) failed"
291
+ end
292
+ result.inserted_count
293
+ else
294
+ raise "data must be a Hash or an Array of Hash's"
295
+ end
296
+ end
297
+
298
+ # Retrieve Url or Document records from the DB.
299
+ def retrieve(collection, query,
300
+ sort = {}, projection = {},
244
301
  limit = 0, skip = 0)
245
- assert_type(query, Hash)
246
- @@client[collection.to_sym].find(query).projection(projection)
247
- .skip(skip).limit(limit).sort(sort)
302
+ assert_type(query, Hash)
303
+ @@client[collection.to_sym].find(query).projection(projection)
304
+ .skip(skip).limit(limit).sort(sort)
305
+ end
306
+
307
+ # Update a Url object in the DB.
308
+ def update_url(url)
309
+ assert_type(url, Url)
310
+ selection = { url: url }
311
+ url_hash = Wgit::Model.url(url).merge(Wgit::Model.common_update_data)
312
+ update = { "$set" => url_hash }
313
+ _update(true, :urls, selection, update)
314
+ end
315
+
316
+ # Update a Document object in the DB.
317
+ def update_doc(doc)
318
+ assert_type(doc, Document)
319
+ selection = { url: doc.url }
320
+ doc_hash = Wgit::Model.document(doc).merge(Wgit::Model.common_update_data)
321
+ update = { "$set" => doc_hash }
322
+ _update(true, :documents, selection, update)
248
323
  end
249
324
 
325
+ # Update one or more Url or Document records in the DB.
250
326
  # NOTE: The Model.common_update_data should be merged in the calling
251
327
  # method as the update param can be bespoke due to its nature.
252
328
  def _update(single, collection, selection, update)
253
- assert_arr_types([selection, update], Hash)
254
- if single
255
- result = @@client[collection.to_sym].update_one(selection, update)
256
- else
257
- result = @@client[collection.to_sym].update_many(selection, update)
258
- end
259
- raise "DB write (update) failed" unless write_succeeded?(result)
260
- result.n
329
+ assert_arr_types([selection, update], Hash)
330
+ if single
331
+ result = @@client[collection.to_sym].update_one(selection, update)
332
+ else
333
+ result = @@client[collection.to_sym].update_many(selection, update)
334
+ end
335
+ raise "DB write (update) failed" unless write_succeeded?(result)
336
+ result.n
261
337
  end
262
338
 
263
339
  alias :count :size
264
340
  alias :length :size
341
+ alias :num_documents :num_docs
342
+ alias :document? :doc?
265
343
  alias :insert_url :insert_urls
266
344
  alias :insert_doc :insert_docs
267
- alias :search_and_format :search_p
268
345
  end
269
346
  end