wgit 0.0.1 → 0.0.2

@@ -0,0 +1,235 @@
+ require_relative 'crawler'
+ require_relative 'database/database'
+
+ module Wgit
+
+ # Convenience method to index the World Wide Web using
+ # Wgit::Indexer#index_the_web.
+ #
+ # Retrieves uncrawled URLs from the database and recursively crawls each
+ # site, storing their internal pages into the database and adding their
+ # external URLs to be crawled at a later date. Prints info about the crawl
+ # to STDOUT as it goes along.
+ #
+ # @param max_sites_to_crawl [Integer] The number of separate and whole
+ # websites to be crawled before the method exits. Defaults to -1 which
+ # means the crawl will occur until manually stopped (Ctrl+C etc).
+ # @param max_data_size [Integer] The maximum amount of bytes that will be
+ # scraped from the web (default is 1GB). Note that this value is used to
+ # determine when to stop crawling; it's not a guarantee of the max data
+ # that will be obtained.
+ def self.index_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
+ db = Wgit::Database.new
+ indexer = Wgit::Indexer.new(db)
+ indexer.index_the_web(max_sites_to_crawl, max_data_size)
+ end
+
+ # Convenience method to index a single website using
+ # Wgit::Indexer#index_this_site.
+ #
+ # Crawls a single website's pages and stores them in the database.
+ # There is no max download limit so be careful which sites you index.
+ #
+ # @param url [Wgit::Url, String] The base Url of the website to crawl.
+ # @param insert_externals [Boolean] Whether or not to insert the website's
+ # external Urls into the database.
+ # @yield [doc] Given the Wgit::Document of each crawled web page, before it
+ # is inserted into the database, allowing for prior manipulation.
+ # @return [Integer] The total number of pages crawled within the website.
+ def self.index_this_site(url, insert_externals = true, &block)
+ url = Wgit::Url.new url
+ db = Wgit::Database.new
+ indexer = Wgit::Indexer.new(db)
+ indexer.index_this_site(url, insert_externals, &block)
+ end
+
+ # Performs a search of the database's indexed documents and pretty prints
+ # the results. See Wgit::Database#search for details of the search.
+ #
+ # @param query [String] The text query to search with.
+ # @param whole_sentence [Boolean] Whether to search for the query as one
+ # whole sentence or for each word separately.
+ # @param limit [Integer] The max number of results to return.
+ # @param skip [Integer] The number of DB records to skip.
+ # @param sentence_length [Integer] The max length of each result's text
+ # snippet.
+ # @yield [doc] Given each search result (Wgit::Document).
+ def self.indexed_search(query, whole_sentence = false, limit = 10,
+ skip = 0, sentence_length = 80, &block)
+ db = Wgit::Database.new
+ results = db.search(query, whole_sentence, limit, skip, &block)
+ Wgit::Utils.printf_search_results(results, query, false, sentence_length)
+ end
+
+ # Class which sets up a crawler and saves the indexed docs to a database.
+ class Indexer
+
+ # The crawler used to scrape the WWW.
+ attr_reader :crawler
+
+ # The database instance used to store Urls and Documents.
+ attr_reader :db
+
+ # Initialize the Indexer.
+ #
+ # @param database [Wgit::Database] The database instance (already
+ # initialized with the correct connection details etc).
+ def initialize(database)
+ @crawler = Wgit::Crawler.new
+ @db = database
+ end
+
+ # Retrieves uncrawled URLs from the database and recursively crawls each
+ # site, storing their internal pages into the database and adding their
+ # external URLs to be crawled at a later date. Prints info about the crawl
+ # to STDOUT as it goes along.
+ #
+ # @param max_sites_to_crawl [Integer] The number of separate and whole
+ # websites to be crawled before the method exits. Defaults to -1 which
+ # means the crawl will occur until manually stopped (Ctrl+C etc).
+ # @param max_data_size [Integer] The maximum amount of bytes that will be
+ # scraped from the web (default is 1GB). Note that this value is used to
+ # determine when to stop crawling; it's not a guarantee of the max data
+ # that will be obtained.
+ def index_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
+ if max_sites_to_crawl < 0
+ puts "Indexing until the database has been filled or it runs out of \
+ urls to crawl (which might be never)."
+ end
+ site_count = 0
+
+ while keep_crawling?(site_count, max_sites_to_crawl, max_data_size) do
+ puts "Current database size: #{@db.size}"
+ @crawler.urls = @db.uncrawled_urls
+
+ if @crawler.urls.empty?
+ puts "No urls to crawl, exiting."
+ return
+ end
+ puts "Starting crawl loop for: #{@crawler.urls}"
+
+ docs_count = 0
+ urls_count = 0
+
+ @crawler.urls.each do |url|
+ unless keep_crawling?(site_count, max_sites_to_crawl, max_data_size)
+ puts "Reached max number of sites to crawl or database \
+ capacity, exiting."
+ return
+ end
+ site_count += 1
+
+ url.crawled = true
+ raise unless @db.update(url) == 1
+
+ site_docs_count = 0
+ ext_links = @crawler.crawl_site(url) do |doc|
+ unless doc.empty?
+ if write_doc_to_db(doc)
+ docs_count += 1
+ site_docs_count += 1
+ end
+ end
+ end
+
+ urls_count += write_urls_to_db(ext_links)
+ puts "Crawled and saved #{site_docs_count} docs for the \
+ site: #{url}"
+ end
+
+ puts "Crawled and saved docs for #{docs_count} url(s) overall for \
+ this iteration."
+ puts "Found and saved #{urls_count} external url(s) for the next \
+ iteration."
+ end
+ end
+
+ # Crawls a single website's pages and stores them in the database.
+ # There is no max download limit so be careful which sites you index.
+ # Prints info about the crawl to STDOUT as it goes along.
+ #
+ # @param url [Wgit::Url] The base Url of the website to crawl.
+ # @param insert_externals [Boolean] Whether or not to insert the website's
+ # external Urls into the database.
+ # @yield [doc] Given the Wgit::Document of each crawled web page, before it
+ # is inserted into the database, allowing for prior manipulation. Return
+ # nil or false from the block to prevent the document from being saved
+ # into the database.
+ # @return [Integer] The total number of webpages/documents indexed.
+ def index_this_site(url, insert_externals = true)
+ total_pages_indexed = 0
+
+ ext_urls = @crawler.crawl_site(url) do |doc|
+ result = true
+ if block_given?
+ result = yield(doc)
+ end
+
+ if result
+ if write_doc_to_db(doc)
+ total_pages_indexed += 1
+ puts "Crawled and saved internal page: #{doc.url}"
+ end
+ end
+ end
+
+ url.crawled = true
+ if !@db.url?(url)
+ @db.insert(url)
+ else
+ @db.update(url)
+ end
+
+ if insert_externals
+ write_urls_to_db(ext_urls)
+ puts "Found and saved #{ext_urls.length} external url(s)"
+ end
+
+ puts "Crawled and saved #{total_pages_indexed} docs for the \
+ site: #{url}"
+
+ total_pages_indexed
+ end
+
+ private
+
+ # Keep crawling or not based on DB size and current loop iteration.
+ def keep_crawling?(site_count, max_sites_to_crawl, max_data_size)
+ return false if @db.size >= max_data_size
+ # If max_sites_to_crawl is -1 for example then crawl away.
+ if max_sites_to_crawl < 0
+ true
+ else
+ site_count < max_sites_to_crawl
+ end
+ end
+
+ # The unique url index on the documents collection prevents duplicate
+ # inserts.
+ def write_doc_to_db(doc)
+ @db.insert(doc)
+ puts "Saved document for url: #{doc.url}"
+ true
+ rescue Mongo::Error::OperationFailure
+ puts "Document already exists: #{doc.url}"
+ false
+ end
+
+ # The unique url index on the urls collection prevents duplicate inserts.
+ def write_urls_to_db(urls)
+ count = 0
+ if urls.respond_to?(:each)
+ urls.each do |url|
+ begin
+ @db.insert(url)
+ count += 1
+ puts "Inserted url: #{url}"
+ rescue Mongo::Error::OperationFailure
+ puts "Url already exists: #{url}"
+ end
+ end
+ end
+ count
+ end
+ end
+ end
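
The new top-level indexing API above can be exercised with a few lines of Ruby. Below is a minimal sketch, assuming the gem is installed, `require 'wgit'` loads it, and Wgit::Database.new can reach a correctly configured MongoDB instance; the example URL and query are placeholders only. Note that the block given to index_this_site must return a truthy value, otherwise the yielded document is not saved (see Indexer#index_this_site above).

    require 'wgit'

    # Crawl and index a single site, inspecting each page before it is saved.
    Wgit.index_this_site("http://www.example.com") do |doc|
      puts "Indexing: #{doc.url}"
      true # A truthy return value allows the document to be written to the DB.
    end

    # Search the indexed documents and pretty print the matching results.
    Wgit.indexed_search("example query")

    # Alternatively, crawl the wider web until 5 whole sites have been indexed.
    Wgit.index_the_web(5)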
@@ -1,140 +1,218 @@
  require_relative 'utils'
+ require_relative 'assertable'
  require 'uri'

  module Wgit

- # @author Michael Telford
  # Class modeling a web based URL.
- # Can be an internal link e.g. "about.html"
- # or a full URL e.g. "http://www.google.co.uk".
+ # Can be an internal/relative link e.g. "about.html" or a full URL
+ # e.g. "http://www.google.co.uk". Is a subclass of String and uses 'uri'
+ # internally.
  class Url < String
- attr_accessor :crawled, :date_crawled
+ include Assertable

- def initialize(url_or_doc, crawled = false, date_crawled = nil)
- if (url_or_doc.is_a?(String))
- url = url_or_doc
- else
- # Init from a mongo collection document.
- url = url_or_doc[:url]
- crawled = url_or_doc[:crawled].nil? ? false : url_or_doc[:crawled]
- date_crawled = url_or_doc[:date_crawled]
- end
- @uri = URI(url)
- @crawled = crawled
- @date_crawled = date_crawled
- super(url)
- end
-
- def self.validate(url)
- if Wgit::Url.relative_link?(url)
- raise "Invalid url (or a relative link): #{url}"
- end
- unless url.start_with?("http://") or url.start_with?("https://")
- raise "Invalid url (missing protocol prefix): #{url}"
- end
- if URI.regexp.match(url).nil?
- raise "Invalid url: #{url}"
- end
- end
-
- def self.valid?(url)
- Wgit::Url.validate(url)
- true
- rescue
- false
- end
-
- # Modifies the receiver url by prefixing it with a protocol.
- # Returns the url whether its been modified or not.
- def self.prefix_protocol(url, https = false)
- unless url.start_with?("http://") or url.start_with?("https://")
- if https
- url.replace("https://#{url}")
- else
- url.replace("http://#{url}")
- end
- end
- url
- end
-
- # URI.split("http://www.google.co.uk/about.html") returns the following:
- # array[2]: "www.google.co.uk", array[5]: "/about.html".
- # This means that all external links in a page are expected to have a
- # protocol prefix e.g. "http://", otherwise the link is treated as an
- # internal link (regardless of whether it is valid or not).
- def self.relative_link?(link)
- link_segs = URI.split(link)
- if not link_segs[2].nil? and not link_segs[2].empty?
- false
- elsif not link_segs[5].nil? and not link_segs[5].empty?
- true
- else
- raise "Invalid link: #{link}"
- end
- end
-
- def self.concat(host, link)
- url = host
- url.chop! if url.end_with?("/")
- link = link[1..-1] if link.start_with?("/")
- Wgit::Url.new(url + "/" + link)
- end
-
- def relative_link?
- Wgit::Url.relative_link?(self)
- end
-
- def valid?
- Wgit::Url.valid?(self)
+ # Whether or not the Url has been crawled.
+ attr_accessor :crawled
+
+ # The date on which the Url was crawled.
+ attr_accessor :date_crawled
+
+ # Initializes a new instance of Wgit::Url which represents a web based
+ # HTTP URL.
+ #
+ # @param url_or_obj [String, Object#fetch#[]] Either a String based
+ # URL or an object representing a Database record e.g. a MongoDB
+ # document/object.
+ # @param crawled [Boolean] Whether or not the HTML of the URL's web
+ # page has been scraped.
+ # @param date_crawled [Time] Should only be provided if crawled is
+ # true. A suitable object can be returned from
+ # Wgit::Utils.time_stamp.
+ # @raise [RuntimeError] If url_or_obj is an Object with missing methods.
+ def initialize(url_or_obj, crawled = false, date_crawled = nil)
+ # Init from a URL String.
+ if url_or_obj.is_a?(String)
+ url = url_or_obj.to_s
+ # Else init from a database object/document.
+ else
+ obj = url_or_obj
+ assert_respond_to(obj, [:fetch, :[]])
+
+ url = obj.fetch("url") # Should always be present.
+ crawled = obj.fetch("crawled", false)
+ date_crawled = obj["date_crawled"]
  end
-
- def concat(link)
- Wgit::Url.concat(self, link)
+
+ @uri = URI(url)
+ @crawled = crawled
+ @date_crawled = date_crawled
+
+ super(url)
+ end
+
+ # Raises an exception if url is not a valid HTTP URL.
+ #
+ # @param url [Wgit::Url, String] The Url to validate.
+ # @raise [RuntimeError] If url is invalid.
+ def self.validate(url)
+ if Wgit::Url.relative_link?(url)
+ raise "Invalid url (or a relative link): #{url}"
  end
-
- def crawled=(bool)
- @crawled = bool
- @date_crawled = bool ? Wgit::Utils.time_stamp : nil
+ unless url.start_with?("http://") or url.start_with?("https://")
+ raise "Invalid url (missing protocol prefix): #{url}"
  end
-
- def to_uri
- @uri
+ if URI.regexp.match(url).nil?
+ raise "Invalid url: #{url}"
  end
-
- def to_url
- self
+ end
+
+ # Determines if the Url is valid or not.
+ #
+ # @param url [Wgit::Url, String] The Url to validate.
+ # @return [Boolean] True if valid, otherwise false.
+ def self.valid?(url)
+ Wgit::Url.validate(url)
+ true
+ rescue
+ false
+ end
+
+ # Modifies the receiver url by prefixing it with a protocol.
+ # Returns the url whether it's been modified or not.
+ # The default protocol prefix is http://.
+ #
+ # @param url [Wgit::Url, String] The url to be prefixed with a protocol.
+ # @param https [Boolean] Whether the protocol prefix is https or http.
+ # @return [Wgit::Url] The url with a protocol prefix.
+ def self.prefix_protocol(url, https = false)
+ unless url.start_with?("http://") or url.start_with?("https://")
+ if https
+ url.replace("https://#{url}")
+ else
+ url.replace("http://#{url}")
+ end
  end
-
- # Given http://www.google.co.uk/about.html, www.google.co.uk is returned.
- def to_host
- Wgit::Url.new(@uri.host)
+ url
+ end
+
+ # Returns whether the link is a relative or absolute Url. How it works:
+ # URI.split("http://www.google.co.uk/about.html") returns the following:
+ # array[2]: "www.google.co.uk", array[5]: "/about.html".
+ # This means that all external links in a page are expected to have a
+ # protocol prefix e.g. "http://", otherwise the link is treated as an
+ # internal link (regardless of whether it is valid or not).
+ #
+ # @param link [Wgit::Url, String] The url to test if relative or not.
+ # @return [Boolean] True if relative, false if absolute.
+ # @raise [RuntimeError] If the link is invalid.
+ def self.relative_link?(link)
+ link_segs = URI.split(link)
+ if not link_segs[2].nil? and not link_segs[2].empty?
+ false
+ elsif not link_segs[5].nil? and not link_segs[5].empty?
+ true
+ else
+ raise "Invalid link: #{link}"
  end
+ end
+
+ # Concats the host and link Strings and returns the result.
+ #
+ # @param host [Wgit::Url, String] The Url host.
+ # @param link [Wgit::Url, String] The link to add to the host prefix.
+ # @return [Wgit::Url] host + "/" + link
+ def self.concat(host, link)
+ url = host
+ url.chop! if url.end_with?("/")
+ link = link[1..-1] if link.start_with?("/")
+ Wgit::Url.new(url + "/" + link)
+ end
+
+ # Returns whether self is a relative or absolute Url.
+ # @return [Boolean] True if relative, false if absolute.
+ # @raise [RuntimeError] If the link is invalid.
+ def relative_link?
+ Wgit::Url.relative_link?(self)
+ end
+
+ # Determines if self is a valid Url or not.
+ #
+ # @return [Boolean] True if valid, otherwise false.
+ def valid?
+ Wgit::Url.valid?(self)
+ end
+
+ # Concats self (Url) and the link.
+ #
+ # @param link [Wgit::Url, String] The link to concat with self.
+ # @return [Wgit::Url] self + "/" + link
+ def concat(link)
+ Wgit::Url.concat(self, link)
+ end
+
+ # Sets the @crawled instance var, also setting @date_crawled to the
+ # current time or nil (depending on the bool value).
+ #
+ # @param bool [Boolean] True if self has been crawled, false otherwise.
+ def crawled=(bool)
+ @crawled = bool
+ @date_crawled = bool ? Wgit::Utils.time_stamp : nil
+ end
+
+ # Returns the @uri instance var of this URL.
+ #
+ # @return [URI::HTTP, URI::HTTPS] The URI object of self.
+ def to_uri
+ @uri
+ end

- # URI.split("http://www.google.co.uk/about.html") returns the following:
- # array[0]: "http://", array[2]: "www.google.co.uk".
- # Returns array[0] + array[2] e.g. http://www.google.co.uk.
- def to_base
- if Wgit::Url.relative_link?(self)
- raise "A relative link doesn't have a base URL: #{self}"
- end
- url_segs = URI.split(self)
- if url_segs[0].nil? or url_segs[2].nil? or url_segs[2].empty?
- raise "Both a protocol and host are needed: #{self}"
- end
- base = "#{url_segs[0]}://#{url_segs[2]}"
- Wgit::Url.new(base)
+ # Returns self.
+ #
+ # @return [Wgit::Url] This (self) Url.
+ def to_url
+ self
+ end
+
+ # Returns a new Wgit::Url containing just the host of this URL e.g.
+ # Given http://www.google.co.uk/about.html, www.google.co.uk is returned.
+ #
+ # @return [Wgit::Url] Containing just the host.
+ def to_host
+ Wgit::Url.new(@uri.host)
+ end
+
+ # Returns the base of this URL e.g. the protocol and host combined.
+ # How it works:
+ # URI.split("http://www.google.co.uk/about.html") returns the following:
+ # array[0]: "http://", array[2]: "www.google.co.uk", which we use.
+ #
+ # @return [Wgit::Url] Base of self (Url) e.g. http://www.google.co.uk.
+ def to_base
+ if Wgit::Url.relative_link?(self)
+ raise "A relative link doesn't have a base URL: #{self}"
  end
-
- def to_h
- ignore = [:@uri]
- h = Wgit::Utils.to_h(self, ignore)
- Hash[h.to_a.insert(0, [:url, self])] # Insert url at position 0.
+ url_segs = URI.split(self)
+ if url_segs[0].nil? or url_segs[2].nil? or url_segs[2].empty?
+ raise "Both a protocol and host are needed: #{self}"
  end
-
- alias :to_hash :to_h
- alias :host :to_host
- alias :base :to_base
- alias :internal_link? :relative_link?
- alias :crawled? :crawled
+ base = "#{url_segs[0]}://#{url_segs[2]}"
+ Wgit::Url.new(base)
+ end
+
+ # Returns a Hash containing this Url's instance vars excluding @uri.
+ # Used when storing the URL in a Database e.g. MongoDB etc.
+ #
+ # @return [Hash] self's instance vars as a Hash.
+ def to_h
+ ignore = ["@uri"]
+ h = Wgit::Utils.to_h(self, ignore)
+ Hash[h.to_a.insert(0, ["url", self])] # Insert url at position 0.
+ end
+
+ alias :to_hash :to_h
+ alias :host :to_host
+ alias :base :to_base
+ alias :internal_link? :relative_link?
+ alias :crawled? :crawled
  end
  end
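
The reworked Wgit::Url is easiest to see in use. The following is a rough sketch based only on the methods shown in the hunk above (the google.co.uk address is purely illustrative, and `require 'wgit'` is assumed to load the library):

    require 'wgit'

    url = Wgit::Url.new("http://www.google.co.uk/about.html")

    url.relative_link?   # => false ("about.html" on its own would be true)
    url.valid?           # => true
    url.to_host          # => "www.google.co.uk"
    url.to_base          # => "http://www.google.co.uk"

    # Build a full Url from a host and a relative link.
    Wgit::Url.concat("http://www.google.co.uk", "about.html")
    # => "http://www.google.co.uk/about.html"

    # Marking the Url as crawled also timestamps it via Wgit::Utils.time_stamp.
    url.crawled = true
    url.crawled?         # => true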