wgit 0.0.10 → 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/wgit/crawler.rb CHANGED
@@ -1,188 +1,188 @@
-require_relative 'url'
-require_relative 'document'
-require_relative 'utils'
-require_relative 'assertable'
-require 'net/http' # requires 'uri'
-
-module Wgit
-
-  # The Crawler class provides a means of crawling web based URL's, turning
-  # their HTML into Wgit::Document's.
-  class Crawler
-    include Assertable
-
-    # The urls to crawl.
-    attr_reader :urls
-
-    # The docs of the crawled @urls.
-    attr_reader :docs
-
-    # Initializes the Crawler by setting the @urls and @docs.
-    #
-    # @param urls [*Wgit::Url] The URLs to crawl.
-    def initialize(*urls)
-      self.[](*urls)
-      @docs = []
-    end
-
-    # Sets this Crawler's @urls.
-    #
-    # @param urls [Array<Wgit::Url>] The URLs to crawl.
-    def urls=(urls)
-      @urls = []
-      Wgit::Utils.each(urls) { |url| add_url(url) }
-    end
-
-    # Sets this Crawler's @urls.
-    #
-    # @param urls [*Wgit::Url] The URLs to crawl.
-    def [](*urls)
-      # If urls is nil then add_url (when called later) will set @urls = []
-      # so we do nothing here.
-      if not urls.nil?
-        # Due to *urls you can end up with [[url1,url2,url3]] etc. where the
-        # outer array is bogus so we use the inner one only.
-        if urls.is_a?(Enumerable) &&
-           urls.length == 1 &&
-           urls.first.is_a?(Enumerable)
-          urls = urls.first
-        end
-
-        # Here we call urls= method using self because the param name is also
-        # urls which conflicts.
-        self.urls = urls
-      end
-    end
-
-    # Adds the url to this Crawler's @urls.
-    #
-    # @param url [Wgit::Url] A URL to crawl.
-    def <<(url)
-      add_url(url)
-    end
-
-    # Crawls individual urls, not entire sites.
-    #
-    # @param urls [Array<Wgit::Url>] The URLs to crawl.
-    # @yield [doc] If provided, the block is given each crawled
-    # Document. Otherwise each doc is added to @docs which can be accessed
-    # by Crawler#docs after this method returns.
-    # @return [Wgit::Document] The last Document crawled.
-    def crawl_urls(urls = @urls, &block)
-      raise "No urls to crawl" unless urls
-      @docs = []
-      doc = nil
-      Wgit::Utils.each(urls) { |url| doc = handle_crawl_block(url, &block) }
-      doc ? doc : @docs.last
-    end
-
-    # Crawl the url and return the response document or nil.
-    #
-    # @param url [Wgit::Document] The URL to crawl.
-    # @yield [doc] The crawled HTML Document regardless if the
-    # crawl was successful or not. Therefore, the Document#url can be used.
-    # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
-    # crawl was unsuccessful.
-    def crawl_url(url = @urls.first)
-      assert_type(url, Wgit::Url)
-      markup = fetch(url)
-      url.crawled = true
-      doc = Wgit::Document.new(url, markup)
-      yield(doc) if block_given?
-      doc.empty? ? nil : doc
-    end
-
-    # Crawls an entire site by recursively going through its internal_links.
-    #
-    # @param base_url [Wgit::Url] The base URL of the website to be crawled.
-    # @yield [doc] Given each crawled Document/page of the site.
-    # A block is the only way to interact with each crawled Document.
-    # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
-    # from all of the site's pages or nil if the base_url could not be
-    # crawled successfully.
-    def crawl_site(base_url = @urls.first, &block)
-      assert_type(base_url, Wgit::Url)
-
-      doc = crawl_url(base_url, &block)
-      return nil if doc.nil?
-
-      path = base_url.path.empty? ? '/' : base_url.path
-      crawled_urls = [path]
-      external_urls = doc.external_links
-      internal_urls = doc.internal_links
-
-      return doc.external_links.uniq if internal_urls.empty?
-
-      loop do
-        internal_urls.uniq!
-
-        links = internal_urls - crawled_urls
-        break if links.empty?
-
-        links.each do |link|
-          doc = crawl_url(Wgit::Url.concat(base_url.to_base, link), &block)
-          crawled_urls << link
-          next if doc.nil?
-          internal_urls.concat(doc.internal_links)
-          external_urls.concat(doc.external_links)
-        end
-      end
-
-      external_urls.uniq
-    end
-
-    private
-
-    # Add the document to the @docs array for later processing or let the block
-    # process it here and now.
-    def handle_crawl_block(url, &block)
-      if block_given?
-        crawl_url(url, &block)
-      else
-        @docs << crawl_url(url)
-        nil
-      end
-    end
-
-    # The fetch method performs a HTTP GET to obtain the HTML document.
-    # Invalid urls or any HTTP response that doesn't return a HTML body will be
-    # ignored and nil will be returned. Otherwise, the HTML is returned.
-    def fetch(url)
-      response = resolve(url)
-      response.body.empty? ? nil : response.body
-    rescue Exception => ex
-      Wgit.logger.debug(
-        "Wgit::Crawler#fetch('#{url}') exception: #{ex.message}"
-      )
-      nil
-    end
-
-    # The resolve method performs a HTTP GET to obtain the HTML document.
-    # A certain amount of redirects will be followed by default before raising
-    # an exception. Redirects can be disabled by setting `redirect_limit: 1`.
-    # The Net::HTTPResponse will be returned.
-    def resolve(url, redirect_limit: 5)
-      redirect_count = -1
-      begin
-        raise "Too many redirects" if redirect_count >= redirect_limit
-        redirect_count += 1
-
-        response = Net::HTTP.get_response(URI(url))
-        location = Wgit::Url.new(response.fetch('location', ''))
-        if not location.empty?
-          url = location.is_relative? ? url.to_base.concat(location) : location
-        end
-      end while response.is_a?(Net::HTTPRedirection)
-      response
-    end
-
-    # Add the url to @urls ensuring it is cast to a Wgit::Url if necessary.
-    def add_url(url)
-      @urls = [] if @urls.nil?
-      @urls << Wgit::Url.new(url)
-    end
-
-    alias :crawl :crawl_urls
-    alias :crawl_r :crawl_site
-  end
-end
+require_relative 'url'
+require_relative 'document'
+require_relative 'utils'
+require_relative 'assertable'
+require 'net/http' # requires 'uri'
+
+module Wgit
+
+  # The Crawler class provides a means of crawling web based URL's, turning
+  # their HTML into Wgit::Document's.
+  class Crawler
+    include Assertable
+
+    # The urls to crawl.
+    attr_reader :urls
+
+    # The docs of the crawled @urls.
+    attr_reader :docs
+
+    # Initializes the Crawler by setting the @urls and @docs.
+    #
+    # @param urls [*Wgit::Url] The URLs to crawl.
+    def initialize(*urls)
+      self.[](*urls)
+      @docs = []
+    end
+
+    # Sets this Crawler's @urls.
+    #
+    # @param urls [Array<Wgit::Url>] The URLs to crawl.
+    def urls=(urls)
+      @urls = []
+      Wgit::Utils.each(urls) { |url| add_url(url) }
+    end
+
+    # Sets this Crawler's @urls.
+    #
+    # @param urls [*Wgit::Url] The URLs to crawl.
+    def [](*urls)
+      # If urls is nil then add_url (when called later) will set @urls = []
+      # so we do nothing here.
+      if not urls.nil?
+        # Due to *urls you can end up with [[url1,url2,url3]] etc. where the
+        # outer array is bogus so we use the inner one only.
+        if urls.is_a?(Enumerable) &&
+           urls.length == 1 &&
+           urls.first.is_a?(Enumerable)
+          urls = urls.first
+        end
+
+        # Here we call urls= method using self because the param name is also
+        # urls which conflicts.
+        self.urls = urls
+      end
+    end
+
+    # Adds the url to this Crawler's @urls.
+    #
+    # @param url [Wgit::Url] A URL to crawl.
+    def <<(url)
+      add_url(url)
+    end
+
+    # Crawls individual urls, not entire sites.
+    #
+    # @param urls [Array<Wgit::Url>] The URLs to crawl.
+    # @yield [doc] If provided, the block is given each crawled
+    # Document. Otherwise each doc is added to @docs which can be accessed
+    # by Crawler#docs after this method returns.
+    # @return [Wgit::Document] The last Document crawled.
+    def crawl_urls(urls = @urls, &block)
+      raise "No urls to crawl" unless urls
+      @docs = []
+      doc = nil
+      Wgit::Utils.each(urls) { |url| doc = handle_crawl_block(url, &block) }
+      doc ? doc : @docs.last
+    end
+
+    # Crawl the url and return the response document or nil.
+    #
+    # @param url [Wgit::Document] The URL to crawl.
+    # @yield [doc] The crawled HTML Document regardless if the
+    # crawl was successful or not. Therefore, the Document#url can be used.
+    # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
+    # crawl was unsuccessful.
+    def crawl_url(url = @urls.first)
+      assert_type(url, Wgit::Url)
+      markup = fetch(url)
+      url.crawled = true
+      doc = Wgit::Document.new(url, markup)
+      yield(doc) if block_given?
+      doc.empty? ? nil : doc
+    end
+
+    # Crawls an entire site by recursively going through its internal_links.
+    #
+    # @param base_url [Wgit::Url] The base URL of the website to be crawled.
+    # @yield [doc] Given each crawled Document/page of the site.
+    # A block is the only way to interact with each crawled Document.
+    # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
+    # from all of the site's pages or nil if the base_url could not be
+    # crawled successfully.
+    def crawl_site(base_url = @urls.first, &block)
+      assert_type(base_url, Wgit::Url)
+
+      doc = crawl_url(base_url, &block)
+      return nil if doc.nil?
+
+      path = base_url.path.nil? ? '/' : base_url.path
+      crawled_urls = [path]
+      external_urls = doc.external_links
+      internal_urls = doc.internal_links
+
+      return doc.external_links.uniq if internal_urls.empty?
+
+      loop do
+        internal_urls.uniq!
+
+        links = internal_urls - crawled_urls
+        break if links.empty?
+
+        links.each do |link|
+          doc = crawl_url(Wgit::Url.concat(base_url.to_base, link), &block)
+          crawled_urls << link
+          next if doc.nil?
+          internal_urls.concat(doc.internal_links)
+          external_urls.concat(doc.external_links)
+        end
+      end
+
+      external_urls.uniq
+    end
+
+    private
+
+    # Add the document to the @docs array for later processing or let the block
+    # process it here and now.
+    def handle_crawl_block(url, &block)
+      if block_given?
+        crawl_url(url, &block)
+      else
+        @docs << crawl_url(url)
+        nil
+      end
+    end
+
+    # The fetch method performs a HTTP GET to obtain the HTML document.
+    # Invalid urls or any HTTP response that doesn't return a HTML body will be
+    # ignored and nil will be returned. Otherwise, the HTML is returned.
+    def fetch(url)
+      response = resolve(url)
+      response.body.empty? ? nil : response.body
+    rescue Exception => ex
+      Wgit.logger.debug(
+        "Wgit::Crawler#fetch('#{url}') exception: #{ex.message}"
+      )
+      nil
+    end
+
+    # The resolve method performs a HTTP GET to obtain the HTML document.
+    # A certain amount of redirects will be followed by default before raising
+    # an exception. Redirects can be disabled by setting `redirect_limit: 1`.
+    # The Net::HTTPResponse will be returned.
+    def resolve(url, redirect_limit: 5)
+      redirect_count = -1
+      begin
+        raise "Too many redirects" if redirect_count >= redirect_limit
+        redirect_count += 1
+
+        response = Net::HTTP.get_response(URI(url))
+        location = Wgit::Url.new(response.fetch('location', ''))
+        if not location.empty?
+          url = location.is_relative? ? url.to_base.concat(location) : location
+        end
+      end while response.is_a?(Net::HTTPRedirection)
+      response
+    end
+
+    # Add the url to @urls ensuring it is cast to a Wgit::Url if necessary.
+    def add_url(url)
+      @urls = [] if @urls.nil?
+      @urls << Wgit::Url.new(url)
+    end
+
+    alias :crawl :crawl_urls
+    alias :crawl_r :crawl_site
+  end
+end
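
The only functional change in crawler.rb is inside crawl_site: the guard on the seed path now tests base_url.path.nil? rather than base_url.path.empty?, which suggests Wgit::Url#path can now return nil (instead of an empty String) when the URL has no path component; every other line appears unchanged apart from whitespace. The snippet below is a minimal usage sketch of crawl_site around that code path; it is not taken from the gem's documentation and assumes the gem is installed and the example host is reachable.

    require 'wgit'

    crawler = Wgit::Crawler.new
    base = Wgit::Url.new('http://example.com') # a base URL with no path component

    # Crawl the whole site; each page is yielded as a Wgit::Document and the
    # unique external links found across all pages are returned (or nil if the
    # base URL itself could not be crawled).
    external_links = crawler.crawl_site(base) do |doc|
      puts "Crawled #{doc.url} (#{doc.internal_links.length} internal links)"
    end

    puts "Collected #{external_links ? external_links.length : 0} external links"

The hunks that follow are from the Wgit::Database class; apart from trailing-whitespace cleanup, the one functional addition is a new alias shown in the final hunk.
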
@@ -8,7 +8,7 @@ require 'mongo'
 
 module Wgit
 
-  # Class modeling a DB connection and CRUD operations for the Url and
+  # Class modeling a DB connection and CRUD operations for the Url and
   # Document collections.
   class Database
     include Assertable
@@ -19,7 +19,7 @@ module Wgit
     def initialize
       conn_details = Wgit::CONNECTION_DETAILS
       if conn_details.empty?
-        raise "Wgit::CONNECTION_DETAILS must be defined and include :host,
+        raise "Wgit::CONNECTION_DETAILS must be defined and include :host,
 :port, :db, :uname, :pword for a database connection to be established."
       end
 
@@ -29,14 +29,14 @@ module Wgit
       Mongo::Logger.logger.level = Logger::ERROR
 
       address = "#{conn_details[:host]}:#{conn_details[:port]}"
-      @@client = Mongo::Client.new([address],
+      @@client = Mongo::Client.new([address],
                                    database: conn_details[:db],
                                    user: conn_details[:uname],
                                    password: conn_details[:pword])
     end
 
     ### Create Data ###
-
+
     # Insert one or more Url or Document objects into the DB.
     #
     # @param data [Hash, Enumerable<Hash>] Hash(es) returned from
@@ -57,9 +57,9 @@ module Wgit
         raise "data is not in the correct format (all Url's or Document's)"
       end
     end
-
+
     ### Retrieve Data ###
-
+
     # Returns Url records from the DB. All Urls are sorted by date_added
     # ascending, in other words the first url returned is the first one that
     # was inserted into the DB.
@@ -71,18 +71,18 @@ module Wgit
     # @return [Array<Wgit::Url>] The Urls obtained from the DB.
     def urls(crawled = nil, limit = 0, skip = 0)
       crawled.nil? ? query = {} : query = { crawled: crawled }
-
+
       sort = { date_added: 1 }
       results = retrieve(:urls, query, sort, {}, limit, skip)
       return [] if results.count < 1
-
+
       # results.respond_to? :map! is false so we use map and overwrite the var.
       results = results.map { |url_doc| Wgit::Url.new(url_doc) }
       results.each { |url| yield(url) } if block_given?
-
+
       results
     end
-
+
     # Returns Url records that have been crawled.
     #
     # @param limit [Integer] The max number of Url's to return. 0 returns all.
@@ -127,20 +127,20 @@ module Wgit
     def search(query, whole_sentence = false, limit = 10, skip = 0)
       query.strip!
       query.replace("\"" + query + "\"") if whole_sentence
-
+
       # The sort_proj sorts based on the most search hits.
       # We use the sort_proj hash as both a sort and a projection below.
       # :$caseSensitive => case_sensitive, 3.2+ only.
       sort_proj = { score: { :$meta => "textScore" } }
       query = { :$text => { :$search => query } }
-
+
       results = retrieve(:documents, query, sort_proj, sort_proj, limit, skip)
       return [] if results.count < 1 # respond_to? :empty? == false
-
+
       # results.respond_to? :map! is false so we use map and overwrite the var.
       results = results.map { |mongo_doc| Wgit::Document.new(mongo_doc) }
       results.each { |doc| yield(doc) } if block_given?
-
+
       results
     end
 
@@ -150,7 +150,7 @@ module Wgit
     def stats
       @@client.command(dbStats: 0).documents[0]
     end
-
+
     # Returns the current size of the database.
     #
     # @return [Integer] The current size of the DB.
@@ -201,7 +201,7 @@ module Wgit
     end
 
     ### Update Data ###
-
+
     # Update a Url or Document object in the DB.
     #
     # @param data [Hash, Enumerable<Hash>] Hash(es) returned from
@@ -254,7 +254,7 @@ module Wgit
       end
       create(:urls, url_or_urls)
     end
-
+
     # Insert one or more Document objects into the DB.
     def insert_docs(doc_or_docs)
       unless doc_or_docs.respond_to?(:map)
@@ -270,7 +270,7 @@ module Wgit
       end
       create(:documents, doc_or_docs)
     end
-
+
     # Create/insert one or more Url or Document records into the DB.
     def create(collection, data)
       assert_type(data, [Hash, Array])
@@ -324,9 +324,9 @@ module Wgit
       update = { "$set" => doc_hash }
       _update(true, :documents, selection, update)
     end
-
+
     # Update one or more Url or Document records in the DB.
-    # NOTE: The Model.common_update_data should be merged in the calling
+    # NOTE: The Model.common_update_data should be merged in the calling
     # method as the update param can be bespoke due to its nature.
     def _update(single, collection, selection, update)
       assert_arr_types([selection, update], Hash)
@@ -338,12 +338,13 @@ module Wgit
       raise "DB write (update) failed" unless write_succeeded?(result)
       result.n
     end
-
+
     alias :count :size
     alias :length :size
     alias :num_documents :num_docs
     alias :document? :doc?
     alias :insert_url :insert_urls
     alias :insert_doc :insert_docs
+    alias :num_objects :num_records
   end
 end
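
The only non-whitespace change to the Database class is the new num_objects alias for the existing num_records method, added alongside the other aliases. A minimal sketch of how it might be called, assuming Wgit::CONNECTION_DETAILS has already been populated with the :host, :port, :db, :uname and :pword keys and a MongoDB instance is reachable:

    require 'wgit'

    db = Wgit::Database.new

    # num_objects is now interchangeable with num_records: the combined count
    # of url and document records stored in the database.
    puts db.num_records
    puts db.num_objects # same value via the new alias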