wgit 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 82f33e00a273c6cdeb3ba9c171110d849fff2428
4
- data.tar.gz: 14c63f826d1d21811b14e9f3a2bca750b3f4afa3
2
+ SHA256:
3
+ metadata.gz: c2ee83f5b722a6aff6fad7727751e149b58de02de3abecd49e51066a1ecf035d
4
+ data.tar.gz: 112fb3b45192e781d8a4b4eeaca6d3ad7abaaa1a29b8833654b17a9b38644f81
5
5
  SHA512:
6
- metadata.gz: 7c42b925f72d9e7cceba79d9aee764f97b6537c0005038501a1f75c36b1bcd3b6036cfb9b62fcf01fd435e0348c1e8c00c445a291051c068fa58184de2c9590a
7
- data.tar.gz: a2a756c3be7b9b214921bfdac5846a2250e452265285cb9c3b812d2eaefc2ab969b608cd1841f34507a6ef184f20ba7c98658daf0135fb85eead88de0356320f
6
+ metadata.gz: a29f32db3538bed0e2b09ae599623ee9751ed0f929f83b1d7a987982132431cc72b89fa5bc3e2522c50c50c4a7d0fd33aaf0fbc000986b1db8f0494a37cebd7c
7
+ data.tar.gz: faf0f814dad58fef4a4ec61a5cdd3f4b3ba7170b417e48a18f4a45922efa0dcab8ffaf4bbe0d198d53e5778056599e6827d1315ce68e08984ca2af47e26631bc
@@ -1,6 +1,6 @@
1
1
  require_relative 'wgit/version'
2
2
  require_relative 'wgit/crawler'
3
- require_relative 'wgit/web_crawler'
3
+ require_relative 'wgit/indexer'
4
4
  require_relative 'wgit/url'
5
5
  require_relative 'wgit/document'
6
6
  require_relative 'wgit/utils'
@@ -1,69 +1,80 @@
1
-
2
1
  module Wgit
3
2
 
4
- # @author Michael Telford
5
- # Module containing assert methods including type checking which can be used
6
- # for asserting the integrity of method definitions etc.
3
+ # Module containing assert methods including type checking which can be used
4
+ # for asserting the integrity of method definitions etc.
7
5
  module Assertable
8
- DEFAULT_TYPE_FAIL_MSG = "Expected: %s, Actual: %s"
9
- WRONG_METHOD_MSG = "arr must be Enumerable, use a different method"
10
- DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s"
11
-
12
- # obj.instance_of? must return true for one of the types listed in
13
- # type_or_types or an exception is thrown using msg if provided.
14
- # type_or_types can be a single Class or an Enumerable of Class objects,
15
- # Strings and Symbols will not work.
16
- def assert_types(obj, type_or_types, msg = nil)
17
- msg ||= DEFAULT_TYPE_FAIL_MSG % [type_or_types, obj.class]
18
- if type_or_types.respond_to?(:any?)
19
- match = type_or_types.any? { |type| obj.instance_of?(type) }
20
- else
21
- match = obj.instance_of?(type_or_types)
22
- end
23
- raise msg unless match
24
- obj
25
- end
26
-
27
- # Each object within arr must match one of the types listed in
28
- # type_or_types or an exception is thrown using msg if provided.
29
- # type_or_types can be a single Class or an Enumerable of Class objects,
30
- # Strings and Symbols will not work.
31
- def assert_arr_types(arr, type_or_types, msg = nil)
32
- raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
33
- arr.each do |obj|
34
- assert_types(obj, type_or_types, msg)
35
- end
6
+ DEFAULT_TYPE_FAIL_MSG = "Expected: %s, Actual: %s".freeze
7
+ WRONG_METHOD_MSG = "arr must be Enumerable, use a different method".freeze
8
+ DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s".freeze
9
+
10
+ # Tests if the obj is of a given type.
11
+ #
12
+ # @param obj [Object] The Object to test.
13
+ # @param type_or_types [Type, Array<Type>] The type/types that obj must
14
+ # belong to or an exception is thrown.
15
+ # @param msg [String] The raised RuntimeError message, if provided.
16
+ # @return [Object] The given obj on successful assertion.
17
+ def assert_types(obj, type_or_types, msg = nil)
18
+ msg ||= DEFAULT_TYPE_FAIL_MSG % [type_or_types, obj.class]
19
+ if type_or_types.respond_to?(:any?)
20
+ match = type_or_types.any? { |type| obj.instance_of?(type) }
21
+ else
22
+ match = obj.instance_of?(type_or_types)
36
23
  end
37
-
38
- # The obj_or_objs must respond_to? all of the given methods or an
39
- # Exception is raised using msg or a default message.
40
- # Returns obj_or_objs on sucessful assertion.
41
- def assert_respond_to(obj_or_objs, methods, msg = nil)
42
- if obj_or_objs.respond_to?(:each)
43
- obj_or_objs.each do |obj|
44
- _assert_respond_to(obj, methods, msg)
45
- end
46
- else
47
- _assert_respond_to(obj_or_objs, methods, msg)
48
- end
49
- obj_or_objs
24
+ raise msg unless match
25
+ obj
26
+ end
27
+
28
+ # Each object within arr must match one of the types listed in
29
+ # type_or_types or an exception is raised using msg, if provided.
30
+ #
31
+ # @param arr [Enumerable#each] Enumerable of objects to type check.
32
+ # @param type_or_types [Type, Array<Type>] The allowed type(s).
33
+ # @param msg [String] The raised RuntimeError message, if provided.
34
+ # @return [Object] The given arr on successful assertion.
35
+ def assert_arr_types(arr, type_or_types, msg = nil)
36
+ raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
37
+ arr.each do |obj|
38
+ assert_types(obj, type_or_types, msg)
50
39
  end
51
-
52
- private
53
-
54
- def _assert_respond_to(obj, methods, msg = nil)
55
- msg ||= DEFAULT_DUCK_FAIL_MSG % ["#{obj.class} (#{obj})", methods]
56
- match = methods.all? { |method| obj.respond_to?(method) }
57
- raise msg unless match
58
- obj
40
+ end
41
+
42
+ # The obj_or_objs must respond_to? all of the given methods or an
43
+ # Exception is raised using msg, if provided.
44
+ #
45
+ # @param obj_or_objs [Object, Enumerable#each] The objects to duck check.
46
+ # @param methods [Array<Symbol>] The methods to :respond_to?.
47
+ # @param msg [String] The raised RuntimeError message, if provided.
48
+ # @return [Object] The given obj_or_objs on successful assertion.
49
+ def assert_respond_to(obj_or_objs, methods, msg = nil)
50
+ methods = [methods] unless methods.respond_to?(:all?)
51
+ if obj_or_objs.respond_to?(:each)
52
+ obj_or_objs.each do |obj|
53
+ _assert_respond_to(obj, methods, msg)
54
+ end
55
+ else
56
+ _assert_respond_to(obj_or_objs, methods, msg)
59
57
  end
60
-
61
- alias :assert_type :assert_types
62
- alias :type :assert_types
63
- alias :types :assert_types
64
- alias :assert_arr_type :assert_arr_types
65
- alias :arr_type :assert_arr_types
66
- alias :arr_types :assert_arr_types
67
- alias :respond_to :assert_respond_to
58
+ obj_or_objs
59
+ end
60
+
61
+ private
62
+
63
+ # obj must respond_to? all methods or an exception is raised.
64
+ def _assert_respond_to(obj, methods, msg = nil)
65
+ raise "methods must respond_to? :all?" unless methods.respond_to?(:all?)
66
+ msg ||= DEFAULT_DUCK_FAIL_MSG % ["#{obj.class} (#{obj})", methods]
67
+ match = methods.all? { |method| obj.respond_to?(method) }
68
+ raise msg unless match
69
+ obj
70
+ end
71
+
72
+ alias :assert_type :assert_types
73
+ alias :type :assert_types
74
+ alias :types :assert_types
75
+ alias :assert_arr_type :assert_arr_types
76
+ alias :arr_type :assert_arr_types
77
+ alias :arr_types :assert_arr_types
78
+ alias :respond_to :assert_respond_to
68
79
  end
69
80
  end
@@ -1,11 +1,12 @@
1
1
  require_relative 'url'
2
2
 
3
- # @author Michael Telford
4
3
  # Script which extends Ruby's core functionality when parsed.
5
- # Needs to be required separately using `require 'wgit/core_ext'`.
4
+ # Needs to be required separately using `require 'wgit/core_ext'`.
6
5
 
7
6
  class String
8
- # Converts a String into a Wgit::Url object.
7
+ # Converts a String into a Wgit::Url object.
8
+ #
9
+ # @return [Wgit::Url] The converted URL.
9
10
  def to_url
10
11
  Wgit::Url.new(self)
11
12
  end
@@ -13,7 +14,9 @@ end
13
14
 
14
15
  module Enumerable
15
16
  # Converts each String instance into a Wgit::Url object and returns the new
16
- # array.
17
+ # Array.
18
+ #
19
+ # @return [Array<Wgit::Url>] The converted URL's.
17
20
  def to_urls
18
21
  map do |element|
19
22
  process_url_element(element)
@@ -21,7 +24,9 @@ module Enumerable
21
24
  end
22
25
 
23
26
  # Converts each String instance into a Wgit::Url object and returns the
24
- # updated array.
27
+ # updated array. Modifies the receiver.
28
+ #
29
+ # @return [Array<Wgit::Url>] Self containing the converted URL's.
25
30
  def to_urls!
26
31
  map! do |element|
27
32
  process_url_element(element)
@@ -31,6 +36,7 @@ end
31
36
 
32
37
  private
33
38
 
39
+ # Converts the element to a Wgit::Url if the element is a String.
34
40
  def process_url_element(element)
35
41
  if element.is_a? String
36
42
  element.to_url
@@ -3,67 +3,106 @@ require_relative 'document'
3
3
  require_relative 'utils'
4
4
  require_relative 'assertable'
5
5
  require 'net/http' # requires 'uri'
6
-
6
+
7
7
  module Wgit
8
8
 
9
- # @author Michael Telford
10
- # Crawler class provides a means of crawling web URL's.
11
- # Note that any redirects will not be followed for during crawling
12
- # functionality.
9
+ # The Crawler class provides a means of crawling web based URL's, turning
10
+ # their HTML into Wgit::Document's.
11
+ # Note that redirects are currently not followed during a crawl.
13
12
  class Crawler
14
13
  include Assertable
15
14
 
16
- attr_reader :urls, :docs
15
+ # The urls to crawl.
16
+ attr_reader :urls
17
+
18
+ # The docs of the crawled @urls.
19
+ attr_reader :docs
17
20
 
18
- def initialize(*urls)
19
- self.urls = urls unless urls.nil?
21
+ # Initializes the Crawler by setting the @urls and @docs.
22
+ #
23
+ # @param urls [*Wgit::Url] The URLs to crawl.
24
+ def initialize(*urls)
25
+ self.[](*urls)
20
26
  @docs = []
21
- end
22
-
27
+ end
28
+
29
+ # Sets this Crawler's @urls.
30
+ #
31
+ # @param urls [Array<Wgit::Url>] The URLs to crawl.
23
32
  def urls=(urls)
24
- @urls = []
25
- Wgit::Utils.each(urls) { |url| add_url(url) }
33
+ @urls = []
34
+ Wgit::Utils.each(urls) { |url| add_url(url) }
26
35
  end
27
36
 
37
+ # Sets this Crawler's @urls.
38
+ #
39
+ # @param urls [*Wgit::Url] The URLs to crawl.
28
40
  def [](*urls)
29
- self.urls = urls unless urls.nil?
41
+ # If urls is nil then add_url (when called later) will set @urls = []
42
+ # so we do nothing here.
43
+ if not urls.nil?
44
+ # Due to *urls you can end up with [[url1,url2,url3]] etc. where the
45
+ # outer array is bogus so we use the inner one only.
46
+ if urls.is_a?(Enumerable) &&
47
+ urls.length == 1 &&
48
+ urls.first.is_a?(Enumerable)
49
+ urls = urls.first
50
+ end
51
+
52
+ # Here we call urls= method using self because the param name is also
53
+ # urls which conflicts.
54
+ self.urls = urls
55
+ end
30
56
  end
31
57
 
58
+ # Adds the url to this Crawler's @urls.
59
+ #
60
+ # @param url [Wgit::Url] A URL to crawl.
32
61
  def <<(url)
33
- add_url(url)
62
+ add_url(url)
34
63
  end
35
-
64
+
36
65
  # Crawls individual urls, not entire sites.
37
- # Returns the last crawled doc.
38
- # Yields each doc to the provided block or adds each doc to @docs
39
- # which can be accessed by Crawler#docs after the method returns.
40
- def crawl_urls(urls = @urls, &block)
66
+ #
67
+ # @param urls [Array<Wgit::Url>] The URLs to crawl.
68
+ # @yield [doc] If provided, the block is given each crawled
69
+ # Document. Otherwise each doc is added to @docs which can be accessed
70
+ # by Crawler#docs after this method returns.
71
+ # @return [Wgit::Document] The last Document crawled.
72
+ def crawl_urls(urls = @urls, &block)
41
73
  raise "No urls to crawl" unless urls
42
74
  @docs = []
43
75
  doc = nil
44
76
  Wgit::Utils.each(urls) { |url| doc = handle_crawl_block(url, &block) }
45
77
  doc ? doc : @docs.last
46
- end
47
-
48
- # Crawl the url and return the response document or nil.
49
- # Also yield(doc) if a block is provided. The doc is passed to the block
50
- # regardless of the crawl success so the doc.url can be used if needed.
51
- def crawl_url(url = @urls.first, &block)
52
- assert_type(url, Url)
53
- markup = fetch(url)
78
+ end
79
+
80
+ # Crawl the url and return the response document or nil.
81
+ #
82
+ # @param url [Wgit::Url] The URL to crawl.
83
+ # @yield [doc] The crawled HTML Document regardless if the
84
+ # crawl was successful or not. Therefore, the Document#url can be used.
85
+ # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
86
+ # crawl was unsuccessful.
87
+ def crawl_url(url = @urls.first)
88
+ assert_type(url, Wgit::Url)
89
+ markup = fetch(url)
54
90
  url.crawled = true
55
91
  doc = Wgit::Document.new(url, markup)
56
- block.call(doc) if block_given?
92
+ yield(doc) if block_given?
57
93
  doc.empty? ? nil : doc
58
- end
59
-
94
+ end
95
+
60
96
  # Crawls an entire site by recursively going through its internal_links.
61
- # Also yield(doc) for each crawled doc if a block is provided.
62
- # A block is the only way to interact with the crawled docs.
63
- # Returns a unique array of external urls collected from the site
64
- # or nil if the base_url could not be crawled successfully.
97
+ #
98
+ # @param base_url [Wgit::Url] The base URL of the website to be crawled.
99
+ # @yield [doc] Given each crawled Document/page of the site.
100
+ # A block is the only way to interact with each crawled Document.
101
+ # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
102
+ # from all of the site's pages or nil if the base_url could not be
103
+ # crawled successfully.
65
104
  def crawl_site(base_url = @urls.first, &block)
66
- assert_type(base_url, Url)
105
+ assert_type(base_url, Wgit::Url)
67
106
 
68
107
  doc = crawl_url(base_url, &block)
69
108
  return nil if doc.nil?
@@ -75,7 +114,7 @@ module Wgit
75
114
  return doc.external_links.uniq if internal_urls.empty?
76
115
 
77
116
  loop do
78
- internal_urls.uniq! unless internal_urls.uniq.nil?
117
+ internal_urls.uniq!
79
118
 
80
119
  links = internal_urls - crawled_urls
81
120
  break if links.empty?
@@ -94,36 +133,37 @@ module Wgit
94
133
 
95
134
  private
96
135
 
97
- # Add the document to the @docs array for later processing
98
- # or let the block process it here and now.
136
+ # Add the document to the @docs array for later processing or let the block
137
+ # process it here and now.
99
138
  def handle_crawl_block(url, &block)
100
- if not block_given?
101
- @docs << crawl_url(url)
102
- nil
103
- else
104
- crawl_url(url, &block)
105
- end
139
+ if block_given?
140
+ crawl_url(url, &block)
141
+ else
142
+ @docs << crawl_url(url)
143
+ nil
144
+ end
106
145
  end
107
146
 
108
147
  # The fetch method performs a HTTP GET to obtain the HTML document.
109
- # Invalid urls or any HTTP response that doesn't return a HTML body
110
- # will be ignored and nil will be returned. This means that redirects
111
- # etc. will not be followed.
148
+ # Invalid urls or any HTTP response that doesn't return a HTML body will be
149
+ # ignored and nil will be returned. This means that redirects etc. will
150
+ # not be followed.
112
151
  def fetch(url)
113
- raise unless url.respond_to?(:to_uri)
114
- res = Net::HTTP.get_response(url.to_uri)
115
- res.body.empty? ? nil : res.body
152
+ raise unless url.respond_to?(:to_uri)
153
+ res = Net::HTTP.get_response(url.to_uri)
154
+ res.body.empty? ? nil : res.body
116
155
  rescue
117
- nil
156
+ nil
118
157
  end
119
158
 
159
+ # Add the url to @urls ensuring it is cast to a Wgit::Url if necessary.
120
160
  def add_url(url)
121
- @urls = [] if @urls.nil?
122
- if url.instance_of?(Url)
123
- @urls << url
124
- else
125
- @urls << Wgit::Url.new(url)
126
- end
161
+ @urls = [] if @urls.nil?
162
+ if url.is_a?(Wgit::Url)
163
+ @urls << url
164
+ else
165
+ @urls << Wgit::Url.new(url)
166
+ end
127
167
  end
128
168
 
129
169
  alias :crawl :crawl_urls
@@ -2,22 +2,19 @@ require_relative '../document'
2
2
  require_relative '../url'
3
3
  require_relative '../utils'
4
4
  require_relative '../assertable'
5
- require_relative 'mongo_connection_details'
6
5
  require_relative 'model'
7
6
  require 'mongo'
8
7
 
9
8
  module Wgit
10
9
 
11
- # @author Michael Telford
12
10
  # Class modeling a DB connection and CRUD operations for the Url and
13
11
  # Document collections.
14
- # The most common methods are: insert, update, urls, search, stats, size.
15
12
  class Database
16
13
  include Assertable
17
-
18
- # Is relative to the root project folder, not this file.
19
- LOG_FILE_PATH = "misc/mongo_log.txt"
20
-
14
+
15
+ # Initializes a database connection client.
16
+ #
17
+ # @raise [RuntimeError] If Wgit::CONNECTION_DETAILS aren't set.
21
18
  def initialize
22
19
  conn_details = Wgit::CONNECTION_DETAILS
23
20
  if conn_details.empty?
@@ -25,146 +22,188 @@ module Wgit
25
22
  :port, :db, :uname, :pword for a database connection to be established."
26
23
  end
27
24
 
28
- logger = Logger.new(LOG_FILE_PATH)
25
+ # Only log to STDOUT in fatal scenarios.
26
+ Mongo::Logger.logger.level = Logger::FATAL
27
+
29
28
  address = "#{conn_details[:host]}:#{conn_details[:port]}"
30
29
  @@client = Mongo::Client.new([address],
31
- :database => conn_details[:db],
32
- :user => conn_details[:uname],
33
- :password => conn_details[:pword],
34
- :logger => logger,
35
- :truncate_logs => false)
30
+ database: conn_details[:db],
31
+ user: conn_details[:uname],
32
+ password: conn_details[:pword])
36
33
  end
37
34
 
38
35
  ### Create Data ###
39
36
 
37
+ # Insert one or more Url or Document objects into the DB.
38
+ #
39
+ # @param data [Wgit::Url, Wgit::Document, Enumerable] A Url or Document,
40
+ # or an Enumerable containing all Url's or all Document's, to insert.
41
+ # @raise [RuntimeError] If the data is not valid.
40
42
  def insert(data)
41
- if data.is_a?(Url)
42
- insert_urls(data)
43
- elsif data.is_a?(Document)
44
- insert_docs(data)
45
- elsif data.respond_to?(:first)
46
- if data.first.is_a?(Url)
47
- insert_urls(data)
48
- else
49
- insert_docs(data)
50
- end
51
- else
52
- raise "data is not in the correct format (all Url's or Document's)"
53
- end
54
- end
55
-
56
- def insert_urls(url_or_urls)
57
- unless url_or_urls.respond_to?(:map)
58
- assert_type(url_or_urls, Url)
59
- url_or_urls = Wgit::Model.url(url_or_urls)
60
- else
61
- assert_arr_types(url_or_urls, Url)
62
- url_or_urls = url_or_urls.map do |url|
63
- Wgit::Model.url(url)
64
- end
65
- end
66
- create(:urls, url_or_urls)
67
- end
68
-
69
- def insert_docs(doc_or_docs)
70
- unless doc_or_docs.respond_to?(:map)
71
- assert_type(doc_or_docs, [Document, Hash])
72
- unless doc_or_docs.is_a?(Hash)
73
- doc_or_docs = Wgit::Model.document(doc_or_docs)
74
- end
43
+ if data.is_a?(Url)
44
+ insert_urls(data)
45
+ elsif data.is_a?(Document)
46
+ insert_docs(data)
47
+ elsif data.respond_to?(:first)
48
+ if data.first.is_a?(Url)
49
+ insert_urls(data)
75
50
  else
76
- assert_arr_types(doc_or_docs, [Document, Hash])
77
- doc_or_docs = doc_or_docs.map do |doc|
78
- Wgit::Model.document(doc) unless doc.is_a?(Hash)
79
- end
51
+ insert_docs(data)
80
52
  end
81
- create(:documents, doc_or_docs)
53
+ else
54
+ raise "data is not in the correct format (all Url's or Document's)"
55
+ end
82
56
  end
83
57
 
84
58
  ### Retrieve Data ###
85
59
 
86
- # A crawled parameter value of nil (the default) returns all urls.
87
- # A limit of 0 means all urls are returned.
88
- # All urls are sorted by date_added ascending, in other words the first
89
- # url in the results is the first added.
90
- def urls(crawled = nil, limit = 0, skip = 0, &block)
91
- crawled.nil? ? query = {} : query = { :crawled => crawled }
60
+ # Returns Url records from the DB. All Urls are sorted by date_added
61
+ # ascending, in other words the first url returned is the first one that
62
+ # was inserted into the DB.
63
+ #
64
+ # @param crawled [Boolean] Filter by Url#crawled value. nil returns all.
65
+ # @param limit [Integer] The max number of Url's to return. 0 returns all.
66
+ # @param skip [Integer] Skip n amount of Url's.
67
+ # @yield [url] Given each Url returned from the DB.
68
+ # @return [Array<Wgit::Url>] The Urls obtained from the DB.
69
+ def urls(crawled = nil, limit = 0, skip = 0)
70
+ crawled.nil? ? query = {} : query = { crawled: crawled }
92
71
 
93
- sort = { :date_added => 1 }
72
+ sort = { date_added: 1 }
94
73
  results = retrieve(:urls, query, sort, {}, limit, skip)
95
74
  return [] if results.count < 1
96
75
 
97
76
  # results.respond_to? :map! is false so we use map and overwrite the var.
98
77
  results = results.map { |url_doc| Wgit::Url.new(url_doc) }
99
- return results unless block_given?
100
- results.each { |url| block.call(url) }
78
+ results.each { |url| yield(url) } if block_given?
79
+
80
+ results
101
81
  end
102
82
 
83
+ # Returns Url records that have been crawled.
84
+ #
85
+ # @param limit [Integer] The max number of Url's to return. 0 returns all.
86
+ # @param skip [Integer] Skip n amount of Url's.
87
+ # @yield [url] Given each Url returned from the DB.
88
+ # @return [Array<Wgit::Url>] The crawled Urls obtained from the DB.
103
89
  def crawled_urls(limit = 0, skip = 0, &block)
104
90
  urls(true, limit, skip, &block)
105
91
  end
106
-
92
+
93
+ # Returns Url records that haven't been crawled. Each Url is yielded to a
94
+ # block, if given.
95
+ #
96
+ # @param limit [Integer] The max number of Url's to return. 0 returns all.
97
+ # @param skip [Integer] Skip n amount of Url's.
98
+ # @yield [url] Given each Url returned from the DB.
99
+ # @return [Array<Wgit::Url>] The uncrawled Urls obtained from the DB.
107
100
  def uncrawled_urls(limit = 0, skip = 0, &block)
108
101
  urls(false, limit, skip, &block)
109
102
  end
110
103
 
104
+ # Searches against the indexed docs in the DB for the given query.
105
+ #
111
106
  # Currently all searches are case insensitive.
112
107
  #
113
- # Searches against the indexed docs in the DB for the given text.
114
- # The searched fields are decided by the text index setup against the
108
+ # The searched fields are decided by the text index setup against the
115
109
  # documents collection. Currently we search against the following fields:
116
110
  # "author", "keywords", "title" and "text".
117
111
  #
118
- # The MongoDB search ranks/sorts the results in order (highest first) based
119
- # upon each documents textScore which records the number of text hits. We
120
- # then store this textScore in each Document object for use elsewhere if
121
- # needed.
112
+ # The MongoDB search ranks/sorts the results in order (highest first) based
113
+ # upon each documents textScore which records the number of query hits. We
114
+ # then store this textScore in each Document result object for use
115
+ # elsewhere if needed.
122
116
  #
123
- # @param text [String] the value to search the data against.
124
- # @param whole_sentence [Boolean] whether multiple words should be
125
- # searched for separately.
126
- # @param limit [Fixnum] the max length/count of the results array.
127
- # @param skip [Fixnum] the number of results to skip, starting with the
128
- # most relevant based upon the textScore of the search.
129
- # @param block [Block] a block which if provided is passed to each result.
130
- #
131
- # @return [Array] of Document objects representing the search results.
132
- def search(text, whole_sentence = false, limit = 10, skip = 0, &block)
133
- text.strip!
134
- text.replace("\"" + text + "\"") if whole_sentence
117
+ # @param query [String] The text query to search with.
118
+ # @param whole_sentence [Boolean] Whether the query is searched for as one
119
+ # quoted phrase (true) rather than as separate words (false).
120
+ # @param limit [Integer] The max number of results to return.
121
+ # @param skip [Integer] The number of DB records to skip.
122
+ # @yield [doc] Given each search result (Wgit::Document).
123
+ # @return [Array<Wgit::Document>] The search results obtained from the DB.
124
+ def search(query, whole_sentence = false, limit = 10, skip = 0)
125
+ query.strip!
126
+ query.replace("\"" + query + "\"") if whole_sentence
135
127
 
136
- # The textScore sorts based on the most search hits.
137
- # We use the textScore hash as a sort and a projection below.
138
- # :$caseSensitive => case_sensitive, # 3.2+ only.
139
- sort_proj = { :score => { :$meta => "textScore" } }
140
- query = { :$text => { :$search => text } }
128
+ # The sort_proj sorts based on the most search hits.
129
+ # We use the sort_proj hash as both a sort and a projection below.
130
+ # :$caseSensitive => case_sensitive, 3.2+ only.
131
+ sort_proj = { score: { :$meta => "textScore" } }
132
+ query = { :$text => { :$search => query } }
133
+
141
134
  results = retrieve(:documents, query, sort_proj, sort_proj, limit, skip)
142
-
143
- return [] if results.count < 1
135
+ return [] if results.count < 1 # respond_to? :empty? == false
136
+
144
137
  # results.respond_to? :map! is false so we use map and overwrite the var.
145
138
  results = results.map { |mongo_doc| Wgit::Document.new(mongo_doc) }
146
- return results unless block_given?
147
- results.each { |doc| block.call(doc) }
139
+ results.each { |doc| yield(doc) } if block_given?
140
+
141
+ results
148
142
  end
149
143
 
150
- # Performs a search and pretty prints the results.
151
- def search_p(text, whole_sentence = false, limit = 10,
152
- skip = 0, sentence_length = 80, &block)
153
- results = search(text, whole_sentence, limit, skip, &block)
154
- Wgit::Utils.printf_search_results(results, text, false, sentence_length)
155
- end
156
-
157
- # Returns a Mongo object which can be used like a Hash to retrieve values.
144
+ # Returns statistics about the database.
145
+ #
146
+ # @return [BSON::Document#[]#fetch] Similar to a Hash instance.
158
147
  def stats
159
- @@client.command(:dbStats => 0).documents[0]
148
+ @@client.command(dbStats: 0).documents[0]
160
149
  end
161
150
 
151
+ # Returns the current size of the database.
152
+ #
153
+ # @return [Integer] The current size of the DB.
162
154
  def size
163
- stats[:dataSize]
155
+ stats[:dataSize]
164
156
  end
165
-
157
+
158
+ # Returns the total number of URL records in the DB.
159
+ #
160
+ # @return [Integer] The current number of URL records.
161
+ def num_urls
162
+ @@client[:urls].count
163
+ end
164
+
165
+ # Returns the total number of Document records in the DB.
166
+ #
167
+ # @return [Integer] The current number of Document records.
168
+ def num_docs
169
+ @@client[:documents].count
170
+ end
171
+
172
+ # Returns the total number of records (urls + docs) in the DB.
173
+ #
174
+ # @return [Integer] The current number of URL and Document records.
175
+ def num_records
176
+ num_urls + num_docs
177
+ end
178
+
179
+ # Returns whether or not a record with the given url (which is unique)
180
+ # exists in the database's 'urls' collection.
181
+ #
182
+ # @param url [Wgit::Url] The Url to search the DB for.
183
+ # @return [Boolean] True if url exists, otherwise false.
184
+ def url?(url)
185
+ h = { "url" => url }
186
+ not @@client[:urls].find(h).none?
187
+ end
188
+
189
+ # Returns whether or not a record with the given doc.url (which is unique)
190
+ # exists in the database's 'documents' collection.
191
+ #
192
+ # @param doc [Wgit::Document] The Document to search the DB for.
193
+ # @return [Boolean] True if doc exists, otherwise false.
194
+ def doc?(doc)
195
+ url = doc.respond_to?(:url) ? doc.url : doc
196
+ h = { "url" => url }
197
+ not @@client[:documents].find(h).none?
198
+ end
199
+
166
200
  ### Update Data ###
167
201
 
202
+ # Update a Url or Document object in the DB.
203
+ #
204
+ # @param data [Wgit::Url, Wgit::Document] The Url or Document object to
205
+ # update in the DB.
206
+ # @raise [RuntimeError] If the data is not valid.
168
207
  def update(data)
169
208
  if data.is_a?(Url)
170
209
  update_url(data)
@@ -174,96 +213,134 @@ module Wgit
174
213
  raise "data is not in the correct format (all Url's or Document's)"
175
214
  end
176
215
  end
177
-
178
- def update_url(url)
179
- assert_type(url, Url)
180
- selection = { :url => url }
181
- url_hash = Wgit::Model.url(url).merge(Wgit::Model.common_update_data)
182
- update = { "$set" => url_hash }
183
- _update(true, :urls, selection, update)
184
- end
185
-
186
- def update_doc(doc)
187
- assert_type(doc, Document)
188
- selection = { :url => doc.url }
189
- doc_hash = Wgit::Model.document(doc).merge(Wgit::Model.common_update_data)
190
- update = { "$set" => doc_hash }
191
- _update(true, :documents, selection, update)
192
- end
193
-
194
- private
195
216
 
217
+ private
218
+
219
+ # Return if the write to the DB succeeded or not.
196
220
  def write_succeeded?(result, count = 1, multi = false)
197
- case result.class.to_s
198
- # Single create result.
199
- when "Mongo::Operation::Write::Insert::Result"
200
- result.documents.first[:err].nil?
201
- # Multiple create result.
202
- when "Mongo::BulkWrite::Result"
203
- result.inserted_count == count
204
- # Single and multiple update result.
205
- when "Mongo::Operation::Write::Update::Result", # MongoDB 3.0
206
- "Mongo::Operation::Write::Update::LegacyResult" # MongoDB 2.4
207
- if multi
208
- result.n == count
209
- else
210
- result.documents.first[:err].nil?
211
- end
221
+ case result.class.to_s
222
+ # Single create result.
223
+ when "Mongo::Operation::Insert::Result"
224
+ result.documents.first[:err].nil?
225
+ # Multiple create result.
226
+ when "Mongo::BulkWrite::Result"
227
+ result.inserted_count == count
228
+ # Single and multiple update result.
229
+ when "Mongo::Operation::Update::Result"
230
+ if multi
231
+ result.n == count
212
232
  else
213
- raise "Result class not currently supported: #{result.class.to_s}"
233
+ result.documents.first[:err].nil?
214
234
  end
235
+ # Class no longer used, have you upgraded the 'mongo' gem?
236
+ else
237
+ raise "Result class not currently supported: #{result.class.to_s}"
238
+ end
239
+ end
240
+
241
+ # Insert one or more Url objects into the DB.
242
+ def insert_urls(url_or_urls)
243
+ unless url_or_urls.respond_to?(:map)
244
+ assert_type(url_or_urls, Url)
245
+ url_or_urls = Wgit::Model.url(url_or_urls)
246
+ else
247
+ assert_arr_types(url_or_urls, Url)
248
+ url_or_urls = url_or_urls.map do |url|
249
+ Wgit::Model.url(url)
250
+ end
251
+ end
252
+ create(:urls, url_or_urls)
215
253
  end
216
254
 
217
- def create(collection, data)
218
- assert_type(data, [Hash, Array])
219
- # Single doc.
220
- if data.is_a?(Hash)
221
- data.merge!(Wgit::Model.common_insert_data)
222
- result = @@client[collection.to_sym].insert_one(data)
223
- unless write_succeeded?(result)
224
- raise "DB write (insert) failed"
225
- end
226
- result.n
227
- # Multiple docs.
228
- elsif data.is_a?(Array)
229
- assert_arr_types(data, Hash)
230
- data.map! do |data_hash|
231
- data_hash.merge(Wgit::Model.common_insert_data)
232
- end
233
- result = @@client[collection.to_sym].insert_many(data)
234
- unless write_succeeded?(result, data.length)
235
- raise "DB write(s) failed"
236
- end
237
- result.inserted_count
238
- else
239
- raise "data must be a Hash or an Array of Hash's"
255
+ # Insert one or more Document objects into the DB.
256
+ def insert_docs(doc_or_docs)
257
+ unless doc_or_docs.respond_to?(:map)
258
+ assert_type(doc_or_docs, [Document, Hash])
259
+ unless doc_or_docs.is_a?(Hash)
260
+ doc_or_docs = Wgit::Model.document(doc_or_docs)
240
261
  end
262
+ else
263
+ assert_arr_types(doc_or_docs, [Document, Hash])
264
+ doc_or_docs = doc_or_docs.map do |doc|
265
+ Wgit::Model.document(doc) unless doc.is_a?(Hash)
266
+ end
267
+ end
268
+ create(:documents, doc_or_docs)
241
269
  end
242
270
 
243
- def retrieve(collection, query, sort = {}, projection = {},
271
+ # Create/insert one or more Url or Document records into the DB.
272
+ def create(collection, data)
273
+ assert_type(data, [Hash, Array])
274
+ # Single doc.
275
+ if data.is_a?(Hash)
276
+ data.merge!(Wgit::Model.common_insert_data)
277
+ result = @@client[collection.to_sym].insert_one(data)
278
+ unless write_succeeded?(result)
279
+ raise "DB write (insert) failed"
280
+ end
281
+ result.n
282
+ # Multiple docs.
283
+ elsif data.is_a?(Array)
284
+ assert_arr_types(data, Hash)
285
+ data.map! do |data_hash|
286
+ data_hash.merge(Wgit::Model.common_insert_data)
287
+ end
288
+ result = @@client[collection.to_sym].insert_many(data)
289
+ unless write_succeeded?(result, data.length)
290
+ raise "DB write(s) failed"
291
+ end
292
+ result.inserted_count
293
+ else
294
+ raise "data must be a Hash or an Array of Hash's"
295
+ end
296
+ end
297
+
298
+ # Retrieve Url or Document records from the DB.
299
+ def retrieve(collection, query,
300
+ sort = {}, projection = {},
244
301
  limit = 0, skip = 0)
245
- assert_type(query, Hash)
246
- @@client[collection.to_sym].find(query).projection(projection)
247
- .skip(skip).limit(limit).sort(sort)
302
+ assert_type(query, Hash)
303
+ @@client[collection.to_sym].find(query).projection(projection)
304
+ .skip(skip).limit(limit).sort(sort)
305
+ end
306
+
307
+ # Update a Url object in the DB.
308
+ def update_url(url)
309
+ assert_type(url, Url)
310
+ selection = { url: url }
311
+ url_hash = Wgit::Model.url(url).merge(Wgit::Model.common_update_data)
312
+ update = { "$set" => url_hash }
313
+ _update(true, :urls, selection, update)
314
+ end
315
+
316
+ # Update a Document object in the DB.
317
+ def update_doc(doc)
318
+ assert_type(doc, Document)
319
+ selection = { url: doc.url }
320
+ doc_hash = Wgit::Model.document(doc).merge(Wgit::Model.common_update_data)
321
+ update = { "$set" => doc_hash }
322
+ _update(true, :documents, selection, update)
248
323
  end
249
324
 
325
+ # Update one or more Url or Document records in the DB.
250
326
  # NOTE: The Model.common_update_data should be merged in the calling
251
327
  # method as the update param can be bespoke due to its nature.
252
328
  def _update(single, collection, selection, update)
253
- assert_arr_types([selection, update], Hash)
254
- if single
255
- result = @@client[collection.to_sym].update_one(selection, update)
256
- else
257
- result = @@client[collection.to_sym].update_many(selection, update)
258
- end
259
- raise "DB write (update) failed" unless write_succeeded?(result)
260
- result.n
329
+ assert_arr_types([selection, update], Hash)
330
+ if single
331
+ result = @@client[collection.to_sym].update_one(selection, update)
332
+ else
333
+ result = @@client[collection.to_sym].update_many(selection, update)
334
+ end
335
+ raise "DB write (update) failed" unless write_succeeded?(result)
336
+ result.n
261
337
  end
262
338
 
263
339
  alias :count :size
264
340
  alias :length :size
341
+ alias :num_documents :num_docs
342
+ alias :document? :doc?
265
343
  alias :insert_url :insert_urls
266
344
  alias :insert_doc :insert_docs
267
- alias :search_and_format :search_p
268
345
  end
269
346
  end