wgit 0.0.1 → 0.0.2

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,235 @@
+ require_relative 'crawler'
+ require_relative 'database/database'
+
+ module Wgit
+
+ # Convenience method to index the World Wide Web using
+ # Wgit::Indexer#index_the_web.
+ #
+ # Retrieves uncrawled url's from the database and recursively crawls each
+ # site storing their internal pages into the database and adding their
+ # external url's to be crawled at a later date. Puts out info on the crawl
+ # to STDOUT as it goes along.
+ #
+ # @param max_sites_to_crawl [Integer] The number of separate and whole
+ # websites to be crawled before the method exits. Defaults to -1 which
+ # means the crawl will occur until manually stopped (Ctrl+C etc).
+ # @param max_data_size [Integer] The maximum amount of bytes that will be
+ # scraped from the web (default is 1GB). Note that this value is used to
+ # determine when to stop crawling; it's not a guarantee of the max data
+ # that will be obtained.
+ def self.index_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
+ db = Wgit::Database.new
+ indexer = Wgit::Indexer.new(db)
+ indexer.index_the_web(max_sites_to_crawl, max_data_size)
+ end
+
+ # Convenience method to index a single website using
+ # Wgit::Indexer#index_this_site.
+ #
+ # Crawls a single website's pages and stores them into the database.
+ # There is no max download limit so be careful which sites you index.
+ #
+ # @param url [Wgit::Url, String] The base Url of the website to crawl.
+ # @param insert_externals [Boolean] Whether or not to insert the website's
+ # external Url's into the database.
+ # @yield [doc] Given the Wgit::Document of each crawled web page, before it
+ # is inserted into the database allowing for prior manipulation.
+ # @return [Integer] The total number of pages crawled within the website.
+ def self.index_this_site(url, insert_externals = true, &block)
+ url = Wgit::Url.new url
+ db = Wgit::Database.new
+ indexer = Wgit::Indexer.new(db)
+ indexer.index_this_site(url, insert_externals, &block)
+ end
+
+ # Performs a search of the database's indexed documents and pretty prints
+ # the results. See Wgit::Database#search for details of the search.
+ #
+ # @param query [String] The text query to search with.
+ # @param whole_sentence [Boolean] Whether multiple words should be searched
+ # for separately.
+ # @param limit [Integer] The max number of results to return.
+ # @param skip [Integer] The number of DB records to skip.
+ # @param sentence_length [Integer] The max length of each result's text
+ # snippet.
+ # @yield [doc] Given each search result (Wgit::Document).
+ def self.indexed_search(query, whole_sentence = false, limit = 10,
+ skip = 0, sentence_length = 80, &block)
+ db = Wgit::Database.new
+ results = db.search(query, whole_sentence, limit, skip, &block)
+ Wgit::Utils.printf_search_results(results, query, false, sentence_length)
+ end
+
+ # Class which sets up a crawler and saves the indexed docs to a database.
+ class Indexer
+
+ # The crawler used to scrape the WWW.
+ attr_reader :crawler
+
+ # The database instance used to store Urls and Documents in.
+ attr_reader :db
+
+ # Initialize the Indexer.
+ #
+ # @param database [Wgit::Database] The database instance (already
+ # initialized with the correct connection details etc).
+ def initialize(database)
+ @crawler = Wgit::Crawler.new
+ @db = database
+ end
+
+ # Retrieves uncrawled url's from the database and recursively crawls each
+ # site storing their internal pages into the database and adding their
+ # external url's to be crawled at a later date. Puts out info on the crawl
+ # to STDOUT as it goes along.
+ #
+ # @param max_sites_to_crawl [Integer] The number of separate and whole
+ # websites to be crawled before the method exits. Defaults to -1 which
+ # means the crawl will occur until manually stopped (Ctrl+C etc).
+ # @param max_data_size [Integer] The maximum amount of bytes that will be
+ # scraped from the web (default is 1GB). Note that this value is used to
+ # determine when to stop crawling; it's not a guarantee of the max data
+ # that will be obtained.
+ def index_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
+ if max_sites_to_crawl < 0
+ puts "Indexing until the database has been filled or it runs out of \
+ urls to crawl (which might be never)."
+ end
+ site_count = 0
+
+ while keep_crawling?(site_count, max_sites_to_crawl, max_data_size) do
+ puts "Current database size: #{@db.size}"
+ @crawler.urls = @db.uncrawled_urls
+
+ if @crawler.urls.empty?
+ puts "No urls to crawl, exiting."
+ return
+ end
+ puts "Starting crawl loop for: #{@crawler.urls}"
+
+ docs_count = 0
+ urls_count = 0
+
+ @crawler.urls.each do |url|
+ unless keep_crawling?(site_count, max_sites_to_crawl, max_data_size)
+ puts "Reached max number of sites to crawl or database \
+ capacity, exiting."
+ return
+ end
+ site_count += 1
+
+ url.crawled = true
+ raise unless @db.update(url) == 1
+
+ site_docs_count = 0
+ ext_links = @crawler.crawl_site(url) do |doc|
+ unless doc.empty?
+ if write_doc_to_db(doc)
+ docs_count += 1
+ site_docs_count += 1
+ end
+ end
+ end
+
+ urls_count += write_urls_to_db(ext_links)
+ puts "Crawled and saved #{site_docs_count} docs for the \
+ site: #{url}"
+ end
+
+ puts "Crawled and saved docs for #{docs_count} url(s) overall for \
+ this iteration."
+ puts "Found and saved #{urls_count} external url(s) for the next \
+ iteration."
+ end
+ end
+
+ # Crawls a single website's pages and stores them into the database.
+ # There is no max download limit so be careful which sites you index.
+ # Puts out info on the crawl to STDOUT as it goes along.
+ #
+ # @param url [Wgit::Url] The base Url of the website to crawl.
+ # @param insert_externals [Boolean] Whether or not to insert the website's
+ # external Url's into the database.
+ # @yield [doc] Given the Wgit::Document of each crawled web page, before it
+ # is inserted into the database allowing for prior manipulation. Return
+ # nil or false from the block to prevent the document from being saved
+ # into the database.
+ # @return [Integer] The total number of webpages/documents indexed.
+ def index_this_site(url, insert_externals = true)
+ total_pages_indexed = 0
+
+ ext_urls = @crawler.crawl_site(url) do |doc|
+ result = true
+ if block_given?
+ result = yield(doc)
+ end
+
+ if result
+ if write_doc_to_db(doc)
+ total_pages_indexed += 1
+ puts "Crawled and saved internal page: #{doc.url}"
+ end
+ end
+ end
+
+ url.crawled = true
+ if !@db.url?(url)
+ @db.insert(url)
+ else
+ @db.update(url)
+ end
+
+ if insert_externals
+ write_urls_to_db(ext_urls)
+ puts "Found and saved #{ext_urls.length} external url(s)"
+ end
+
+ puts "Crawled and saved #{total_pages_indexed} docs for the \
+ site: #{url}"
+
+ total_pages_indexed
+ end
+
+ private
+
+ # Keep crawling or not based on DB size and current loop iteration.
+ def keep_crawling?(site_count, max_sites_to_crawl, max_data_size)
+ return false if @db.size >= max_data_size
+ # If max_sites_to_crawl is -1 for example then crawl away.
+ if max_sites_to_crawl < 0
+ true
+ else
+ site_count < max_sites_to_crawl
+ end
+ end
+
+ # The unique url index on the documents collection prevents duplicate
+ # inserts.
+ def write_doc_to_db(doc)
+ @db.insert(doc)
+ puts "Saved document for url: #{doc.url}"
+ true
+ rescue Mongo::Error::OperationFailure
+ puts "Document already exists: #{doc.url}"
+ false
+ end
+
+ # The unique url index on the urls collection prevents duplicate inserts.
+ def write_urls_to_db(urls)
+ count = 0
+ if urls.respond_to?(:each)
+ urls.each do |url|
+ begin
+ @db.insert(url)
+ count += 1
+ puts "Inserted url: #{url}"
+ rescue Mongo::Error::OperationFailure
+ puts "Url already exists: #{url}"
+ end
+ end
+ end
+ count
+ end
+ end
+ end
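
The new file above adds module-level convenience methods (Wgit.index_the_web, Wgit.index_this_site and Wgit.indexed_search) on top of the new Wgit::Indexer class. Below is a minimal usage sketch, not part of the diff: it assumes the gem is loaded via its usual top-level require, that a MongoDB instance is reachable with whatever connection details Wgit::Database.new expects (connection setup isn't shown in this diff), and the example URL and query are placeholders.

require 'wgit'

# Index a single site; each crawled page is yielded as a Wgit::Document
# before it is saved, and a falsey block return value skips saving it.
Wgit.index_this_site("http://www.example.com") do |doc|
  puts doc.url
  true # Return true so the page is still written to the database.
end

# Search the documents indexed above and pretty print the results.
Wgit.indexed_search("example")
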
@@ -1,140 +1,218 @@
  require_relative 'utils'
+ require_relative 'assertable'
  require 'uri'

  module Wgit

- # @author Michael Telford
  # Class modeling a web based URL.
- # Can be an internal link e.g. "about.html"
- # or a full URL e.g. "http://www.google.co.uk".
+ # Can be an internal/relative link e.g. "about.html" or a full URL
+ # e.g. "http://www.google.co.uk". Is a subclass of String and uses 'uri'
+ # internally.
  class Url < String
- attr_accessor :crawled, :date_crawled
+ include Assertable

- def initialize(url_or_doc, crawled = false, date_crawled = nil)
- if (url_or_doc.is_a?(String))
- url = url_or_doc
- else
- # Init from a mongo collection document.
- url = url_or_doc[:url]
- crawled = url_or_doc[:crawled].nil? ? false : url_or_doc[:crawled]
- date_crawled = url_or_doc[:date_crawled]
- end
- @uri = URI(url)
- @crawled = crawled
- @date_crawled = date_crawled
- super(url)
- end
-
- def self.validate(url)
- if Wgit::Url.relative_link?(url)
- raise "Invalid url (or a relative link): #{url}"
- end
- unless url.start_with?("http://") or url.start_with?("https://")
- raise "Invalid url (missing protocol prefix): #{url}"
- end
- if URI.regexp.match(url).nil?
- raise "Invalid url: #{url}"
- end
- end
-
- def self.valid?(url)
- Wgit::Url.validate(url)
- true
- rescue
- false
- end
-
- # Modifies the receiver url by prefixing it with a protocol.
- # Returns the url whether its been modified or not.
- def self.prefix_protocol(url, https = false)
- unless url.start_with?("http://") or url.start_with?("https://")
- if https
- url.replace("https://#{url}")
- else
- url.replace("http://#{url}")
- end
- end
- url
- end
-
- # URI.split("http://www.google.co.uk/about.html") returns the following:
- # array[2]: "www.google.co.uk", array[5]: "/about.html".
- # This means that all external links in a page are expected to have a
- # protocol prefix e.g. "http://", otherwise the link is treated as an
- # internal link (regardless of whether it is valid or not).
- def self.relative_link?(link)
- link_segs = URI.split(link)
- if not link_segs[2].nil? and not link_segs[2].empty?
- false
- elsif not link_segs[5].nil? and not link_segs[5].empty?
- true
- else
- raise "Invalid link: #{link}"
- end
- end
-
- def self.concat(host, link)
- url = host
- url.chop! if url.end_with?("/")
- link = link[1..-1] if link.start_with?("/")
- Wgit::Url.new(url + "/" + link)
- end
-
- def relative_link?
- Wgit::Url.relative_link?(self)
- end
-
- def valid?
- Wgit::Url.valid?(self)
+ # Whether or not the Url has been crawled or not.
+ attr_accessor :crawled
+
+ # The date which the Url was crawled.
+ attr_accessor :date_crawled
+
+ # Initializes a new instance of Wgit::Url which represents a web based
+ # HTTP URL.
+ #
+ # @param url_or_obj [String, Object#fetch#[]] Is either a String based
+ # URL or an object representing a Database record e.g. a MongoDB
+ # document/object.
+ # @param crawled [Boolean] Whether or not the HTML of the URL's web
+ # page has been scraped or not.
+ # @param date_crawled [Time] Should only be provided if crawled is
+ # true. A suitable object can be returned from
+ # Wgit::Utils.time_stamp.
+ # @raise [RuntimeError] If url_or_obj is an Object with missing methods.
+ def initialize(url_or_obj, crawled = false, date_crawled = nil)
+ # Init from a URL String.
+ if url_or_obj.is_a?(String)
+ url = url_or_obj.to_s
+ # Else init from a database object/document.
+ else
+ obj = url_or_obj
+ assert_respond_to(obj, [:fetch, :[]])
+
+ url = obj.fetch("url") # Should always be present.
+ crawled = obj.fetch("crawled", false)
+ date_crawled = obj["date_crawled"]
  end
-
- def concat(link)
- Wgit::Url.concat(self, link)
+
+ @uri = URI(url)
+ @crawled = crawled
+ @date_crawled = date_crawled
+
+ super(url)
+ end
+
+ # Raises an exception if url is not a valid HTTP URL.
+ #
+ # @param url [Wgit::Url, String] The Url to validate.
+ # @raise [RuntimeError] If url is invalid.
+ def self.validate(url)
+ if Wgit::Url.relative_link?(url)
+ raise "Invalid url (or a relative link): #{url}"
  end
-
- def crawled=(bool)
- @crawled = bool
- @date_crawled = bool ? Wgit::Utils.time_stamp : nil
+ unless url.start_with?("http://") or url.start_with?("https://")
+ raise "Invalid url (missing protocol prefix): #{url}"
  end
-
- def to_uri
- @uri
+ if URI.regexp.match(url).nil?
+ raise "Invalid url: #{url}"
  end
-
- def to_url
- self
+ end
+
+ # Determines if the Url is valid or not.
+ #
+ # @param url [Wgit::Url, String] The Url to validate.
+ # @return [Boolean] True if valid, otherwise false.
+ def self.valid?(url)
+ Wgit::Url.validate(url)
+ true
+ rescue
+ false
+ end
+
+ # Modifies the receiver url by prefixing it with a protocol.
+ # Returns the url whether it's been modified or not.
+ # The default protocol prefix is http://.
+ #
+ # @param url [Wgit::Url, String] The url to be prefixed with a protocol.
+ # @param https [Boolean] Whether the protocol prefix is https or http.
+ # @return [Wgit::Url] The url with a protocol prefix.
+ def self.prefix_protocol(url, https = false)
+ unless url.start_with?("http://") or url.start_with?("https://")
+ if https
+ url.replace("https://#{url}")
+ else
+ url.replace("http://#{url}")
+ end
  end
-
- # Given http://www.google.co.uk/about.html, www.google.co.uk is returned.
- def to_host
- Wgit::Url.new(@uri.host)
+ url
+ end
+
+ # Returns if link is a relative or absolute Url. How it works:
+ # URI.split("http://www.google.co.uk/about.html") returns the following:
+ # array[2]: "www.google.co.uk", array[5]: "/about.html".
+ # This means that all external links in a page are expected to have a
+ # protocol prefix e.g. "http://", otherwise the link is treated as an
+ # internal link (regardless of whether it is valid or not).
+ #
+ # @param link [Wgit::Url, String] The url to test if relative or not.
+ # @return [Boolean] True if relative, false if absolute.
+ # @raise [RuntimeError] If the link is invalid.
+ def self.relative_link?(link)
+ link_segs = URI.split(link)
+ if not link_segs[2].nil? and not link_segs[2].empty?
+ false
+ elsif not link_segs[5].nil? and not link_segs[5].empty?
+ true
+ else
+ raise "Invalid link: #{link}"
  end
+ end
+
+ # Concats the host and link Strings and returns the result.
+ #
+ # @param host [Wgit::Url, String] The Url host.
+ # @param link [Wgit::Url, String] The link to add to the host prefix.
+ # @return [Wgit::Url] host + "/" + link
+ def self.concat(host, link)
+ url = host
+ url.chop! if url.end_with?("/")
+ link = link[1..-1] if link.start_with?("/")
+ Wgit::Url.new(url + "/" + link)
+ end
+
+ # Returns if self is a relative or absolute Url.
+ # @return [Boolean] True if relative, false if absolute.
+ # @raise [RuntimeError] If the link is invalid.
+ def relative_link?
+ Wgit::Url.relative_link?(self)
+ end
+
+ # Determines if self is a valid Url or not.
+ #
+ # @return [Boolean] True if valid, otherwise false.
+ def valid?
+ Wgit::Url.valid?(self)
+ end
+
+ # Concats self (Url) and the link.
+ #
+ # @param link [Wgit::Url, String] The link to concat with self.
+ # @return [Wgit::Url] self + "/" + link
+ def concat(link)
+ Wgit::Url.concat(self, link)
+ end
+
+ # Sets the @crawled instance var, also setting @date_crawled to the
+ # current time or nil (depending on the bool value).
+ #
+ # @param bool [Boolean] True if self has been crawled, false otherwise.
+ def crawled=(bool)
+ @crawled = bool
+ @date_crawled = bool ? Wgit::Utils.time_stamp : nil
+ end
+
+ # Returns the @uri instance var of this URL.
+ #
+ # @return [URI::HTTP, URI::HTTPS] The URI object of self.
+ def to_uri
+ @uri
+ end

- # URI.split("http://www.google.co.uk/about.html") returns the following:
- # array[0]: "http://", array[2]: "www.google.co.uk".
- # Returns array[0] + array[2] e.g. http://www.google.co.uk.
- def to_base
- if Wgit::Url.relative_link?(self)
- raise "A relative link doesn't have a base URL: #{self}"
- end
- url_segs = URI.split(self)
- if url_segs[0].nil? or url_segs[2].nil? or url_segs[2].empty?
- raise "Both a protocol and host are needed: #{self}"
- end
- base = "#{url_segs[0]}://#{url_segs[2]}"
- Wgit::Url.new(base)
+ # Returns self.
+ #
+ # @return [Wgit::Url] This (self) Url.
+ def to_url
+ self
+ end
+
+ # Returns a new Wgit::Url containing just the host of this URL e.g.
+ # Given http://www.google.co.uk/about.html, www.google.co.uk is returned.
+ #
+ # @return [Wgit::Url] Containing just the host.
+ def to_host
+ Wgit::Url.new(@uri.host)
+ end
+
+ # Returns the base of this URL e.g. the protocol and host combined.
+ # How it works:
+ # URI.split("http://www.google.co.uk/about.html") returns the following:
+ # array[0]: "http://", array[2]: "www.google.co.uk", which we use.
+ #
+ # @return [Wgit::Url] Base of self (Url) e.g. http://www.google.co.uk.
+ def to_base
+ if Wgit::Url.relative_link?(self)
+ raise "A relative link doesn't have a base URL: #{self}"
  end
-
- def to_h
- ignore = [:@uri]
- h = Wgit::Utils.to_h(self, ignore)
- Hash[h.to_a.insert(0, [:url, self])] # Insert url at position 0.
+ url_segs = URI.split(self)
+ if url_segs[0].nil? or url_segs[2].nil? or url_segs[2].empty?
+ raise "Both a protocol and host are needed: #{self}"
  end
-
- alias :to_hash :to_h
- alias :host :to_host
- alias :base :to_base
- alias :internal_link? :relative_link?
- alias :crawled? :crawled
+ base = "#{url_segs[0]}://#{url_segs[2]}"
+ Wgit::Url.new(base)
+ end
+
+ # Returns a Hash containing this Url's instance vars excluding @uri.
+ # Used when storing the URL in a Database e.g. MongoDB etc.
+ #
+ # @return [Hash] self's instance vars as a Hash.
+ def to_h
+ ignore = ["@uri"]
+ h = Wgit::Utils.to_h(self, ignore)
+ Hash[h.to_a.insert(0, ["url", self])] # Insert url at position 0.
+ end
+
+ alias :to_hash :to_h
+ alias :host :to_host
+ alias :base :to_base
+ alias :internal_link? :relative_link?
+ alias :crawled? :crawled
  end
  end
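
The reworked Wgit::Url above keeps the String subclass behaviour but adds YARD docs, the Assertable mixin and initialisation from a database record that responds to #fetch and #[] (using String keys rather than the previous Symbol keys). Below is a brief usage sketch, not part of the diff, based only on the methods shown in it; the literal values are placeholders.

require 'wgit'

url = Wgit::Url.new("http://www.google.co.uk/about.html")
url.relative_link? # => false
url.to_host        # => "www.google.co.uk"
url.to_base        # => "http://www.google.co.uk"

# New in 0.0.2: build a Url from a database record (e.g. a MongoDB document)
# that responds to #fetch and #[], keyed by Strings instead of Symbols.
record = { "url" => "http://www.example.com", "crawled" => true }
url = Wgit::Url.new(record)
url.crawled? # => true
url.to_h     # A Hash of the Url's instance vars (minus @uri), for DB storage.
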