wgit 0.0.1

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 82f33e00a273c6cdeb3ba9c171110d849fff2428
+   data.tar.gz: 14c63f826d1d21811b14e9f3a2bca750b3f4afa3
+ SHA512:
+   metadata.gz: 7c42b925f72d9e7cceba79d9aee764f97b6537c0005038501a1f75c36b1bcd3b6036cfb9b62fcf01fd435e0348c1e8c00c445a291051c068fa58184de2c9590a
+   data.tar.gz: a2a756c3be7b9b214921bfdac5846a2250e452265285cb9c3b812d2eaefc2ab969b608cd1841f34507a6ef184f20ba7c98658daf0135fb85eead88de0356320f
data/lib/wgit.rb ADDED
@@ -0,0 +1,11 @@
+ require_relative 'wgit/version'
+ require_relative 'wgit/crawler'
+ require_relative 'wgit/web_crawler'
+ require_relative 'wgit/url'
+ require_relative 'wgit/document'
+ require_relative 'wgit/utils'
+ require_relative 'wgit/assertable'
+ require_relative 'wgit/database/database'
+ require_relative 'wgit/database/model'
+ require_relative 'wgit/database/mongo_connection_details'
+ #require_relative 'wgit/core_ext'
data/lib/wgit/assertable.rb ADDED
@@ -0,0 +1,69 @@
+
+ module Wgit
+
+   # @author Michael Telford
+   # Module containing assert methods, including type checking, which can be
+   # used for asserting the integrity of method parameters etc.
+   module Assertable
+     DEFAULT_TYPE_FAIL_MSG = "Expected: %s, Actual: %s"
+     WRONG_METHOD_MSG = "arr must be Enumerable, use a different method"
+     DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s"
+
+     # obj.instance_of? must return true for one of the types listed in
+     # type_or_types or an exception is raised using msg, if provided.
+     # type_or_types can be a single Class or an Enumerable of Class objects;
+     # Strings and Symbols will not work.
+     def assert_types(obj, type_or_types, msg = nil)
+       msg ||= DEFAULT_TYPE_FAIL_MSG % [type_or_types, obj.class]
+       if type_or_types.respond_to?(:any?)
+         match = type_or_types.any? { |type| obj.instance_of?(type) }
+       else
+         match = obj.instance_of?(type_or_types)
+       end
+       raise msg unless match
+       obj
+     end
+
+     # Each object within arr must match one of the types listed in
+     # type_or_types or an exception is raised using msg, if provided.
+     # type_or_types can be a single Class or an Enumerable of Class objects;
+     # Strings and Symbols will not work.
+     def assert_arr_types(arr, type_or_types, msg = nil)
+       raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
+       arr.each do |obj|
+         assert_types(obj, type_or_types, msg)
+       end
+     end
+
+     # The obj_or_objs must respond_to? all of the given methods or an
+     # exception is raised using msg or a default message.
+     # Returns obj_or_objs on successful assertion.
+     def assert_respond_to(obj_or_objs, methods, msg = nil)
+       if obj_or_objs.respond_to?(:each)
+         obj_or_objs.each do |obj|
+           _assert_respond_to(obj, methods, msg)
+         end
+       else
+         _assert_respond_to(obj_or_objs, methods, msg)
+       end
+       obj_or_objs
+     end
+
+     private
+
+     def _assert_respond_to(obj, methods, msg = nil)
+       msg ||= DEFAULT_DUCK_FAIL_MSG % ["#{obj.class} (#{obj})", methods]
+       match = methods.all? { |method| obj.respond_to?(method) }
+       raise msg unless match
+       obj
+     end
+
+     alias :assert_type :assert_types
+     alias :type :assert_types
+     alias :types :assert_types
+     alias :assert_arr_type :assert_arr_types
+     alias :arr_type :assert_arr_types
+     alias :arr_types :assert_arr_types
+     alias :respond_to :assert_respond_to
+   end
+ end
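A minimal usage sketch (editor's illustration, not part of the gem; the Widget class is hypothetical) showing the module guarding constructor arguments:

    require 'wgit'

    class Widget
      include Wgit::Assertable

      def initialize(name, tags)
        assert_type(name, String)                # Raises unless name is a String.
        assert_arr_types(tags, [String, Symbol]) # Each element must match one type.
        @name, @tags = name, tags
      end
    end

    Widget.new("w1", [:a, "b"]) # => #<Widget ...>
    Widget.new(123, [])         # => RuntimeError: Expected: String, Actual: ...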
data/lib/wgit/core_ext.rb ADDED
@@ -0,0 +1,40 @@
+ require_relative 'url'
+
+ # @author Michael Telford
+ # Script which extends Ruby's core functionality when parsed.
+ # Needs to be required separately using `require 'wgit/core_ext'`.
+
+ class String
+   # Converts a String into a Wgit::Url object.
+   def to_url
+     Wgit::Url.new(self)
+   end
+ end
+
+ module Enumerable
+   # Converts each String instance into a Wgit::Url object and returns the
+   # new Array.
+   def to_urls
+     map do |element|
+       process_url_element(element)
+     end
+   end
+
+   # Converts each String instance into a Wgit::Url object and returns the
+   # updated (mutated) Enumerable.
+   def to_urls!
+     map! do |element|
+       process_url_element(element)
+     end
+   end
+ end
+
+ private
+
+ # Top level helper (a private method on Object) used by the Enumerable
+ # extensions above; non String elements are returned unchanged.
+ def process_url_element(element)
+   if element.is_a? String
+     element.to_url
+   else
+     element
+   end
+ end
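A short sketch of the core extensions in use (editor's illustration):

    require 'wgit'
    require 'wgit/core_ext' # Not loaded by default.

    "http://www.google.co.uk".to_url.class # => Wgit::Url
    ["about.html", "http://example.com"].to_urls.map(&:class)
    # => [Wgit::Url, Wgit::Url]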
data/lib/wgit/crawler.rb ADDED
@@ -0,0 +1,132 @@
+ require_relative 'url'
+ require_relative 'document'
+ require_relative 'utils'
+ require_relative 'assertable'
+ require 'net/http' # Requires 'uri'.
+
+ module Wgit
+
+   # @author Michael Telford
+   # Crawler class provides a means of crawling web URLs.
+   # Note that redirects are not followed during any of the crawl
+   # functionality.
+   class Crawler
+     include Assertable
+
+     attr_reader :urls, :docs
+
+     def initialize(*urls)
+       self.urls = urls unless urls.nil?
+       @docs = []
+     end
+
+     def urls=(urls)
+       @urls = []
+       Wgit::Utils.each(urls) { |url| add_url(url) }
+     end
+
+     def [](*urls)
+       self.urls = urls unless urls.nil?
+     end
+
+     def <<(url)
+       add_url(url)
+     end
+
+     # Crawls individual urls, not entire sites.
+     # Returns the last crawled doc.
+     # Yields each doc to the provided block or adds each doc to @docs,
+     # which can be accessed by Crawler#docs after the method returns.
+     def crawl_urls(urls = @urls, &block)
+       raise "No urls to crawl" unless urls
+       @docs = []
+       doc = nil
+       Wgit::Utils.each(urls) { |url| doc = handle_crawl_block(url, &block) }
+       doc ? doc : @docs.last
+     end
+
+     # Crawls the url and returns the response document or nil.
+     # Also yields the doc if a block is provided. The doc is passed to the
+     # block regardless of the crawl's success, so the doc.url can be used
+     # if needed.
+     def crawl_url(url = @urls.first, &block)
+       assert_type(url, Url)
+       markup = fetch(url)
+       url.crawled = true
+       doc = Wgit::Document.new(url, markup)
+       block.call(doc) if block_given?
+       doc.empty? ? nil : doc
+     end
+
+     # Crawls an entire site by recursively following its internal links.
+     # Also yields each crawled doc if a block is provided.
+     # A block is the only way to interact with the crawled docs.
+     # Returns a unique array of external urls collected from the site,
+     # or nil if the base_url could not be crawled successfully.
+     def crawl_site(base_url = @urls.first, &block)
+       assert_type(base_url, Url)
+
+       doc = crawl_url(base_url, &block)
+       return nil if doc.nil?
+
+       crawled_urls  = []
+       external_urls = doc.external_links
+       internal_urls = doc.internal_links
+
+       return doc.external_links.uniq if internal_urls.empty?
+
+       loop do
+         internal_urls.uniq!
+
+         links = internal_urls - crawled_urls
+         break if links.empty?
+
+         links.each do |link|
+           doc = crawl_url(Wgit::Url.concat(base_url.to_base, link), &block)
+           crawled_urls << link
+           next if doc.nil?
+           internal_urls.concat(doc.internal_links)
+           external_urls.concat(doc.external_links)
+         end
+       end
+
+       external_urls.uniq
+     end
+
+     private
+
+     # Add the document to the @docs array for later processing
+     # or let the block process it here and now.
+     def handle_crawl_block(url, &block)
+       if block_given?
+         crawl_url(url, &block)
+       else
+         @docs << crawl_url(url)
+         nil
+       end
+     end
+
+     # The fetch method performs a HTTP GET to obtain the HTML document.
+     # Invalid urls, or any HTTP response that doesn't return a HTML body,
+     # will be ignored and nil returned. This means that redirects etc.
+     # will not be followed.
+     def fetch(url)
+       raise unless url.respond_to?(:to_uri)
+       res = Net::HTTP.get_response(url.to_uri)
+       res.body.empty? ? nil : res.body
+     rescue
+       nil
+     end
+
+     def add_url(url)
+       @urls = [] if @urls.nil?
+       if url.instance_of?(Url)
+         @urls << url
+       else
+         @urls << Wgit::Url.new(url)
+       end
+     end
+
+     alias :crawl :crawl_urls
+     alias :crawl_r :crawl_site
+   end
+ end
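A brief crawling sketch (editor's illustration; example.com stands in for any reachable site):

    require 'wgit'

    crawler = Wgit::Crawler.new Wgit::Url.new("http://example.com")

    # Single page; returns the Document or nil on failure.
    doc = crawler.crawl_url
    puts doc.title unless doc.nil?

    # Whole site; each crawled page is yielded, external links are returned.
    ext_links = crawler.crawl_site { |page| puts page.url }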
data/lib/wgit/database/database.rb ADDED
@@ -0,0 +1,269 @@
+ require_relative '../document'
+ require_relative '../url'
+ require_relative '../utils'
+ require_relative '../assertable'
+ require_relative 'mongo_connection_details'
+ require_relative 'model'
+ require 'mongo'
+
+ module Wgit
+
+   # @author Michael Telford
+   # Class modeling a DB connection and CRUD operations for the Url and
+   # Document collections.
+   # The most common methods are: insert, update, urls, search, stats, size.
+   class Database
+     include Assertable
+
+     # Is relative to the root project folder, not this file.
+     LOG_FILE_PATH = "misc/mongo_log.txt"
+
+     def initialize
+       conn_details = Wgit::CONNECTION_DETAILS
+       if conn_details.empty?
+         raise "Wgit::CONNECTION_DETAILS must be defined and include :host, \
+ :port, :db, :uname, :pword for a database connection to be established."
+       end
+
+       logger = Logger.new(LOG_FILE_PATH)
+       address = "#{conn_details[:host]}:#{conn_details[:port]}"
+       @@client = Mongo::Client.new([address],
+                                    :database => conn_details[:db],
+                                    :user => conn_details[:uname],
+                                    :password => conn_details[:pword],
+                                    :logger => logger,
+                                    :truncate_logs => false)
+     end
+
+     ### Create Data ###
+
+     def insert(data)
+       if data.is_a?(Url)
+         insert_urls(data)
+       elsif data.is_a?(Document)
+         insert_docs(data)
+       elsif data.respond_to?(:first)
+         if data.first.is_a?(Url)
+           insert_urls(data)
+         else
+           insert_docs(data)
+         end
+       else
+         raise "data is not in the correct format (all Urls or Documents)"
+       end
+     end
+
+     def insert_urls(url_or_urls)
+       if url_or_urls.respond_to?(:map)
+         assert_arr_types(url_or_urls, Url)
+         url_or_urls = url_or_urls.map do |url|
+           Wgit::Model.url(url)
+         end
+       else
+         assert_type(url_or_urls, Url)
+         url_or_urls = Wgit::Model.url(url_or_urls)
+       end
+       create(:urls, url_or_urls)
+     end
+
+     def insert_docs(doc_or_docs)
+       if doc_or_docs.respond_to?(:map)
+         assert_arr_types(doc_or_docs, [Document, Hash])
+         doc_or_docs = doc_or_docs.map do |doc|
+           doc.is_a?(Hash) ? doc : Wgit::Model.document(doc)
+         end
+       else
+         assert_type(doc_or_docs, [Document, Hash])
+         unless doc_or_docs.is_a?(Hash)
+           doc_or_docs = Wgit::Model.document(doc_or_docs)
+         end
+       end
+       create(:documents, doc_or_docs)
+     end
+
+     ### Retrieve Data ###
+
+     # A crawled parameter value of nil (the default) returns all urls.
+     # A limit of 0 means all urls are returned.
+     # All urls are sorted by date_added ascending; in other words the first
+     # url in the results is the first added.
+     def urls(crawled = nil, limit = 0, skip = 0, &block)
+       query = crawled.nil? ? {} : { :crawled => crawled }
+
+       sort = { :date_added => 1 }
+       results = retrieve(:urls, query, sort, {}, limit, skip)
+       return [] if results.count < 1
+
+       # results.respond_to? :map! is false so we use map and overwrite the var.
+       results = results.map { |url_doc| Wgit::Url.new(url_doc) }
+       return results unless block_given?
+       results.each { |url| block.call(url) }
+     end
+
+     def crawled_urls(limit = 0, skip = 0, &block)
+       urls(true, limit, skip, &block)
+     end
+
+     def uncrawled_urls(limit = 0, skip = 0, &block)
+       urls(false, limit, skip, &block)
+     end
+
+     # Currently all searches are case insensitive.
+     #
+     # Searches against the indexed docs in the DB for the given text.
+     # The searched fields are decided by the text index setup against the
+     # documents collection. Currently we search against the following fields:
+     # "author", "keywords", "title" and "text".
+     #
+     # The MongoDB search ranks/sorts the results in order (highest first)
+     # based upon each document's textScore, which records the number of text
+     # hits. We then store this textScore in each Document object for use
+     # elsewhere if needed.
+     #
+     # @param text [String] the value to search the data against.
+     # @param whole_sentence [Boolean] whether to search for the exact phrase
+     # rather than matching each word separately.
+     # @param limit [Fixnum] the max length/count of the results array.
+     # @param skip [Fixnum] the number of results to skip, starting with the
+     # most relevant based upon the textScore of the search.
+     # @param block [Block] a block which, if provided, is passed each result.
+     #
+     # @return [Array] of Document objects representing the search results.
+     def search(text, whole_sentence = false, limit = 10, skip = 0, &block)
+       text.strip!
+       text.replace("\"" + text + "\"") if whole_sentence
+
+       # The textScore sorts based on the most search hits.
+       # We use the textScore hash as a sort and a projection below.
+       # :$caseSensitive => case_sensitive, # 3.2+ only.
+       sort_proj = { :score => { :$meta => "textScore" } }
+       query = { :$text => { :$search => text } }
+       results = retrieve(:documents, query, sort_proj, sort_proj, limit, skip)
+
+       return [] if results.count < 1
+       # results.respond_to? :map! is false so we use map and overwrite the var.
+       results = results.map { |mongo_doc| Wgit::Document.new(mongo_doc) }
+       return results unless block_given?
+       results.each { |doc| block.call(doc) }
+     end
+
+     # Performs a search and pretty prints the results.
+     def search_p(text, whole_sentence = false, limit = 10,
+                  skip = 0, sentence_length = 80, &block)
+       results = search(text, whole_sentence, limit, skip, &block)
+       Wgit::Utils.printf_search_results(results, text, false, sentence_length)
+     end
+
+     # Returns a Mongo object which can be used like a Hash to retrieve values.
+     def stats
+       @@client.command(:dbStats => 1).documents[0]
+     end
+
+     def size
+       stats[:dataSize]
+     end
+
+     ### Update Data ###
+
+     def update(data)
+       if data.is_a?(Url)
+         update_url(data)
+       elsif data.is_a?(Document)
+         update_doc(data)
+       else
+         raise "data is not in the correct format (all Urls or Documents)"
+       end
+     end
+
+     def update_url(url)
+       assert_type(url, Url)
+       selection = { :url => url }
+       url_hash = Wgit::Model.url(url).merge(Wgit::Model.common_update_data)
+       update = { "$set" => url_hash }
+       _update(true, :urls, selection, update)
+     end
+
+     def update_doc(doc)
+       assert_type(doc, Document)
+       selection = { :url => doc.url }
+       doc_hash = Wgit::Model.document(doc).merge(Wgit::Model.common_update_data)
+       update = { "$set" => doc_hash }
+       _update(true, :documents, selection, update)
+     end
+
+     private
+
+     def write_succeeded?(result, count = 1, multi = false)
+       case result.class.to_s
+       # Single create result.
+       when "Mongo::Operation::Write::Insert::Result"
+         result.documents.first[:err].nil?
+       # Multiple create result.
+       when "Mongo::BulkWrite::Result"
+         result.inserted_count == count
+       # Single and multiple update result.
+       when "Mongo::Operation::Write::Update::Result", # MongoDB 3.0
+            "Mongo::Operation::Write::Update::LegacyResult" # MongoDB 2.4
+         if multi
+           result.n == count
+         else
+           result.documents.first[:err].nil?
+         end
+       else
+         raise "Result class not currently supported: #{result.class.to_s}"
+       end
+     end
+
+     def create(collection, data)
+       assert_type(data, [Hash, Array])
+       # Single doc.
+       if data.is_a?(Hash)
+         data.merge!(Wgit::Model.common_insert_data)
+         result = @@client[collection.to_sym].insert_one(data)
+         unless write_succeeded?(result)
+           raise "DB write (insert) failed"
+         end
+         result.n
+       # Multiple docs.
+       elsif data.is_a?(Array)
+         assert_arr_types(data, Hash)
+         data.map! do |data_hash|
+           data_hash.merge(Wgit::Model.common_insert_data)
+         end
+         result = @@client[collection.to_sym].insert_many(data)
+         unless write_succeeded?(result, data.length)
+           raise "DB write(s) failed"
+         end
+         result.inserted_count
+       else
+         raise "data must be a Hash or an Array of Hashes"
+       end
+     end
+
+     def retrieve(collection, query, sort = {}, projection = {},
+                  limit = 0, skip = 0)
+       assert_type(query, Hash)
+       @@client[collection.to_sym].find(query).projection(projection)
+                                  .skip(skip).limit(limit).sort(sort)
+     end
+
+     # NOTE: The Model.common_update_data should be merged in the calling
+     # method as the update param can be bespoke due to its nature.
+     def _update(single, collection, selection, update)
+       assert_arr_types([selection, update], Hash)
+       if single
+         result = @@client[collection.to_sym].update_one(selection, update)
+       else
+         result = @@client[collection.to_sym].update_many(selection, update)
+       end
+       raise "DB write (update) failed" unless write_succeeded?(result)
+       result.n
+     end
+
+     alias :count :size
+     alias :length :size
+     alias :insert_url :insert_urls
+     alias :insert_doc :insert_docs
+     alias :search_and_format :search_p
+   end
+ end
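A usage sketch (editor's illustration; assumes Wgit::CONNECTION_DETAILS points at a reachable MongoDB instance with the text index described above):

    require 'wgit'

    db = Wgit::Database.new

    # Store a url, then list everything not yet crawled.
    db.insert Wgit::Url.new("http://example.com")
    db.uncrawled_urls { |url| puts url }

    # Full text search against the indexed documents.
    db.search("ruby crawler") { |doc| puts "#{doc.score} - #{doc.url}" }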
data/lib/wgit/database/model.rb ADDED
@@ -0,0 +1,31 @@
+ require_relative '../utils'
+
+ module Wgit
+
+   # @author Michael Telford
+   # Module containing the DB data model structure.
+   module Model
+     def self.url(url)
+       raise "url must respond to to_h" unless url.respond_to?(:to_h)
+       url.to_h
+     end
+
+     def self.document(doc)
+       raise "doc must respond to to_h" unless doc.respond_to?(:to_h)
+       doc.to_h(false)
+     end
+
+     def self.common_insert_data
+       {
+         :date_added => Wgit::Utils.time_stamp,
+         :date_modified => Wgit::Utils.time_stamp,
+       }
+     end
+
+     def self.common_update_data
+       {
+         :date_modified => Wgit::Utils.time_stamp,
+       }
+     end
+   end
+ end
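For illustration (editor's sketch), the model methods simply reduce objects to Hashes, which Database#create then decorates with the timestamp data:

    url = Wgit::Url.new("http://example.com")
    Wgit::Model.url(url)
    # => { :url => "http://example.com", :crawled => false, :date_crawled => nil }

    Wgit::Model.common_insert_data.keys
    # => [:date_added, :date_modified]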
data/lib/wgit/database/mongo_connection_details.rb ADDED
@@ -0,0 +1,27 @@
+
+ # @author Michael Telford
+ module Wgit
+   DB_PROVIDER = :MongoLabs.freeze
+
+   # OpenShift (MongoDB 2.4)
+   if DB_PROVIDER == :OpenShift
+     CONNECTION_DETAILS = {
+       :host => "127.0.0.1",
+       :port => "27017",
+       :db => "admin",
+       :uname => "admin",
+       :pword => "R5jUKv1fessb"
+     }.freeze
+   # MongoLabs (MongoDB 3.0)
+   elsif DB_PROVIDER == :MongoLabs
+     CONNECTION_DETAILS = {
+       :host => "ds037205.mongolab.com",
+       :port => "37205",
+       :db => "crawler",
+       :uname => "rubyapp",
+       :pword => "R5jUKv1fessb",
+     }.freeze
+   else
+     raise "Database provider '#{DB_PROVIDER}' is not recognized"
+   end
+ end
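To point Wgit at a different deployment (editor's sketch; the :Local symbol and every value below are hypothetical), an extra branch following the same pattern could be added to the if/elsif chain above, before the else:

    # Local development (MongoDB 3.x)
    elsif DB_PROVIDER == :Local
      CONNECTION_DETAILS = {
        :host => "localhost",
        :port => "27017",
        :db => "wgit_dev",
        :uname => "dev",
        :pword => "changeme",
      }.freeze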
data/lib/wgit/document.rb ADDED
@@ -0,0 +1,293 @@
+ require_relative 'url'
+ require_relative 'utils'
+ require_relative 'assertable'
+ require 'nokogiri'
+
+ module Wgit
+
+   # @author Michael Telford
+   # Class modeling a HTML web document. Also doubles as a search result.
+   class Document
+     include Assertable
+
+     TEXT_ELEMENTS = [:dd, :div, :dl, :dt, :figcaption, :figure, :hr, :li,
+                      :main, :ol, :p, :pre, :span, :ul, :h1, :h2, :h3, :h4, :h5]
+
+     attr_reader :url, :html, :title, :author, :keywords, :links, :text, :score
+
+     def initialize(url_or_doc, html = nil)
+       if url_or_doc.is_a?(String)
+         assert_type(url_or_doc, Url)
+         html ||= ""
+
+         @url = url_or_doc
+         @html = html
+
+         @doc = Nokogiri::HTML(html) do |config|
+           # TODO: Remove #'s below when crawling in production.
+           #config.options = Nokogiri::XML::ParseOptions::STRICT |
+           #                 Nokogiri::XML::ParseOptions::NONET
+         end
+
+         init_title
+         init_author
+         init_keywords
+         init_links
+         init_text
+         @score = 0.0
+       else
+         # Init from a mongo collection document.
+         @url = Wgit::Url.new(url_or_doc[:url])
+         @html = url_or_doc[:html].nil? ? "" : url_or_doc[:html]
+         @title = url_or_doc[:title]
+         @author = url_or_doc[:author]
+         @keywords = url_or_doc[:keywords].nil? ? [] : url_or_doc[:keywords]
+         @links = url_or_doc[:links].nil? ? [] : url_or_doc[:links]
+         @links.map! { |link| Wgit::Url.new(link) }
+         @text = url_or_doc[:text].nil? ? [] : url_or_doc[:text]
+         @score = url_or_doc[:score].nil? ? 0.0 : url_or_doc[:score]
+       end
+     end
+
+     def internal_links
+       return [] if @links.empty?
+       @links.reject do |link|
+         begin
+           not link.relative_link?
+         rescue
+           true
+         end
+       end
+     end
+
+     def internal_full_links
+       return [] if internal_links.empty?
+       internal_links.map do |link|
+         link.replace("/" + link) unless link.start_with?("/")
+         Wgit::Url.new(@url.to_base + link)
+       end
+     end
+
+     def external_links
+       return [] if @links.empty?
+       @links.reject do |link|
+         begin
+           link.relative_link?
+         rescue
+           true
+         end
+       end
+     end
+
+     def stats
+       hash = {}
+       instance_variables.each do |var|
+         # Add up the total bytes of text as well as the length.
+         if var == :@text
+           count = 0
+           @text.each { |t| count += t.length }
+           hash[:text_length] = @text.length
+           hash[:text_bytes] = count
+         # Else take the #length method return value.
+         else
+           next unless instance_variable_get(var).respond_to?(:length)
+           hash[var[1..-1].to_sym] = instance_variable_get(var).send(:length)
+         end
+       end
+       hash
+     end
+
+     def size
+       stats[:html]
+     end
+
+     def to_h(include_html = false)
+       ignore = include_html ? [] : [:@html]
+       ignore << :@doc # Always ignore :@doc.
+       Wgit::Utils.to_h(self, ignore)
+     end
+
+     # Override of the default == method; equal if url and html both match.
+     # Use doc.object_id == other_doc.object_id for exact object comparison.
+     def ==(other_doc)
+       return false unless other_doc.is_a? Wgit::Document
+       url == other_doc.url and html == other_doc.html
+     end
+
+     # Shortcut for calling Document#html[range].
+     def [](range)
+       html[range]
+     end
+
+     def empty?
+       html.strip.empty?
+     end
+
+     # Searches against the Document#text for the given search text.
+     # The number of search hits for each sentence is recorded internally
+     # and used to rank/sort the search results before they are returned.
+     # Where the Database#search method searches all documents for the most
+     # hits, this method searches each document's text for the most hits.
+     #
+     # Each search result comprises a sentence of a given length. The length
+     # will be based on the sentence_limit parameter or the full length of
+     # the original sentence, whichever is less. The algorithm ensures that
+     # the search value is visible somewhere in the sentence.
+     #
+     # @param text [String] the value to search the document text against.
+     # @param sentence_limit [Fixnum] the length of each search result
+     # sentence.
+     #
+     # @return [Array] of String objects representing the search results.
+     def search(text, sentence_limit = 80)
+       raise "A search value must be provided" if text.empty?
+       raise "The sentence length value must be even" if sentence_limit.odd?
+
+       results = {}
+       regex = Regexp.new(text, Regexp::IGNORECASE)
+
+       @text.each do |sentence|
+         hits = sentence.scan(regex).count
+         if hits > 0
+           sentence.strip!
+           index = sentence.index(regex)
+           Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
+           results[sentence] = hits
+         end
+       end
+
+       return [] if results.empty?
+       results = Hash[results.sort_by { |k, v| v }]
+       results.keys.reverse
+     end
+
+     # Performs a text search (see #search for details) but assigns the
+     # results to the @text instance variable. This can be used for sub
+     # search functionality. Note that there is no way of getting the
+     # original text back however.
+     def search!(text)
+       @text = search(text)
+     end
+
+     # Uses Nokogiri's xpath method to search the doc's html and return the
+     # results.
+     def xpath(xpath)
+       @doc.xpath(xpath)
+     end
+
+     private
+
+     def process_str(str)
+       str.encode!('UTF-8', 'UTF-8', :invalid => :replace)
+       str.strip!
+       str # This is required to return the str, do not remove.
+     end
+
+     def process_arr(array)
+       assert_arr_types(array, String)
+       array.map! { |str| process_str(str) }
+       array.reject! { |str| str.empty? }
+       array.uniq!
+     end
+
+     # Modifies internal links by removing this doc's base or host url if
+     # present. http://www.google.co.uk/about.html (with or without the
+     # protocol prefix) will become about.html, meaning it'll appear within
+     # internal_links.
+     def process_internal_links(links)
+       links.map! do |link|
+         host_or_base = if link.start_with?("http")
+                          url.base
+                        else
+                          url.host
+                        end
+         if link.start_with?(host_or_base)
+           link.sub!(host_or_base, "")
+           link.replace(link[1..-1]) if link.start_with?("/")
+           link.strip!
+         end
+         link
+       end
+     end
+
+     def text_elements_xpath
+       xpath = ""
+       return xpath if TEXT_ELEMENTS.empty?
+       el_xpath = "//%s/text()"
+       TEXT_ELEMENTS.each_with_index do |el, i|
+         xpath += " | " unless i == 0
+         xpath += el_xpath % [el]
+       end
+       xpath
+     end
+
+     def init_var(xpath, var, first_result = true)
+       results = @doc.xpath(xpath)
+       unless results.nil? || results.empty?
+         result = if first_result
+                    results.first.content
+                  else
+                    results.map { |res| res.content }
+                  end
+         instance_variable_set(var, result)
+       end
+     end
+
+     def init_title
+       @title = nil
+       xpath = "//title"
+       init_var(xpath, :@title)
+       process_str(@title) unless @title.nil?
+     end
+
+     def init_author
+       @author = nil
+       xpath = "//meta[@name='author']/@content"
+       init_var(xpath, :@author)
+       process_str(@author) unless @author.nil?
+     end
+
+     def init_keywords
+       @keywords = nil
+       xpath = "//meta[@name='keywords']/@content"
+       init_var(xpath, :@keywords)
+       return @keywords = [] unless @keywords
+       @keywords = @keywords.split(",")
+       process_arr(@keywords)
+     end
+
+     def init_links
+       @links = nil
+       xpath = "//a/@href"
+       init_var(xpath, :@links, false)
+       return @links = [] unless @links
+       process_arr(@links)
+       @links.reject! { |link| link == "/" }
+       @links.map! do |link|
+         begin
+           Wgit::Url.new(link)
+         rescue
+           nil
+         end
+       end
+       @links.reject! { |link| link.nil? }
+       process_internal_links(@links)
+     end
+
+     def init_text
+       @text = nil
+       xpath = text_elements_xpath
+       init_var(xpath, :@text, false)
+       return @text = [] unless @text
+       process_arr(@text)
+     end
+
+     alias :to_hash :to_h
+     alias :relative_links :internal_links
+     alias :relative_urls :internal_links
+     alias :relative_full_links :internal_full_links
+     alias :relative_full_urls :internal_full_links
+     alias :external_urls :external_links
+   end
+ end
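A parsing and search sketch (editor's illustration using inline HTML):

    require 'wgit'

    html = "<html><title>Hello</title><p>Ruby is great. I like Ruby.</p>" \
           "<a href='/about'>About</a></html>"
    doc = Wgit::Document.new(Wgit::Url.new("http://example.com"), html)

    doc.title          # => "Hello"
    doc.internal_links # => ["/about"]
    doc.search("ruby") # => ["Ruby is great. I like Ruby."]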
data/lib/wgit/url.rb ADDED
@@ -0,0 +1,140 @@
+ require_relative 'utils'
+ require 'uri'
+
+ module Wgit
+
+   # @author Michael Telford
+   # Class modeling a web based URL.
+   # Can be an internal link e.g. "about.html"
+   # or a full URL e.g. "http://www.google.co.uk".
+   class Url < String
+     attr_accessor :crawled, :date_crawled
+
+     def initialize(url_or_doc, crawled = false, date_crawled = nil)
+       if url_or_doc.is_a?(String)
+         url = url_or_doc
+       else
+         # Init from a mongo collection document.
+         url = url_or_doc[:url]
+         crawled = url_or_doc[:crawled].nil? ? false : url_or_doc[:crawled]
+         date_crawled = url_or_doc[:date_crawled]
+       end
+       @uri = URI(url)
+       @crawled = crawled
+       @date_crawled = date_crawled
+       super(url)
+     end
+
+     def self.validate(url)
+       if Wgit::Url.relative_link?(url)
+         raise "Invalid url (or a relative link): #{url}"
+       end
+       unless url.start_with?("http://") or url.start_with?("https://")
+         raise "Invalid url (missing protocol prefix): #{url}"
+       end
+       if URI.regexp.match(url).nil?
+         raise "Invalid url: #{url}"
+       end
+     end
+
+     def self.valid?(url)
+       Wgit::Url.validate(url)
+       true
+     rescue
+       false
+     end
+
+     # Modifies the receiver url by prefixing it with a protocol.
+     # Returns the url whether it's been modified or not.
+     def self.prefix_protocol(url, https = false)
+       unless url.start_with?("http://") or url.start_with?("https://")
+         if https
+           url.replace("https://#{url}")
+         else
+           url.replace("http://#{url}")
+         end
+       end
+       url
+     end
+
+     # URI.split("http://www.google.co.uk/about.html") returns the following:
+     # array[2]: "www.google.co.uk", array[5]: "/about.html".
+     # This means that all external links in a page are expected to have a
+     # protocol prefix e.g. "http://", otherwise the link is treated as an
+     # internal link (regardless of whether it is valid or not).
+     def self.relative_link?(link)
+       link_segs = URI.split(link)
+       if not link_segs[2].nil? and not link_segs[2].empty?
+         false
+       elsif not link_segs[5].nil? and not link_segs[5].empty?
+         true
+       else
+         raise "Invalid link: #{link}"
+       end
+     end
+
+     def self.concat(host, link)
+       url = host
+       url.chop! if url.end_with?("/")
+       link = link[1..-1] if link.start_with?("/")
+       Wgit::Url.new(url + "/" + link)
+     end
+
+     def relative_link?
+       Wgit::Url.relative_link?(self)
+     end
+
+     def valid?
+       Wgit::Url.valid?(self)
+     end
+
+     def concat(link)
+       Wgit::Url.concat(self, link)
+     end
+
+     def crawled=(bool)
+       @crawled = bool
+       @date_crawled = bool ? Wgit::Utils.time_stamp : nil
+     end
+
+     def to_uri
+       @uri
+     end
+
+     def to_url
+       self
+     end
+
+     # Given http://www.google.co.uk/about.html, www.google.co.uk is returned.
+     def to_host
+       Wgit::Url.new(@uri.host)
+     end
+
+     # URI.split("http://www.google.co.uk/about.html") returns the following:
+     # array[0]: "http", array[2]: "www.google.co.uk".
+     # Returns array[0] + "://" + array[2] e.g. http://www.google.co.uk.
+     def to_base
+       if Wgit::Url.relative_link?(self)
+         raise "A relative link doesn't have a base URL: #{self}"
+       end
+       url_segs = URI.split(self)
+       if url_segs[0].nil? or url_segs[2].nil? or url_segs[2].empty?
+         raise "Both a protocol and host are needed: #{self}"
+       end
+       base = "#{url_segs[0]}://#{url_segs[2]}"
+       Wgit::Url.new(base)
+     end
+
+     def to_h
+       ignore = [:@uri]
+       h = Wgit::Utils.to_h(self, ignore)
+       Hash[h.to_a.insert(0, [:url, self])] # Insert url at position 0.
+     end
+
+     alias :to_hash :to_h
+     alias :host :to_host
+     alias :base :to_base
+     alias :internal_link? :relative_link?
+     alias :crawled? :crawled
+   end
+ end
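A short sketch of the Url helpers (editor's illustration):

    require 'wgit'

    url = Wgit::Url.new("http://www.google.co.uk/about.html")
    url.relative_link? # => false
    url.to_host        # => "www.google.co.uk"
    url.to_base        # => "http://www.google.co.uk"
    url.crawled = true # Also records @date_crawled.

    Wgit::Url.relative_link?("about.html") # => true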
data/lib/wgit/utils.rb ADDED
@@ -0,0 +1,115 @@
+
+ module Wgit
+
+   # @author Michael Telford
+   # Utility module containing generic methods.
+   module Utils
+     def self.time_stamp
+       Time.new
+     end
+
+     # Returns a hash created from obj's instance vars and values.
+     def self.to_h(obj, ignore = [])
+       hash = {}
+       obj.instance_variables.each do |var|
+         next if ignore.include?(var)
+         hash[var[1..-1].to_sym] = obj.instance_variable_get(var)
+       end
+       hash
+     end
+
+     # An each method which handles both singleton and Enumerable objects.
+     # Yields one or more objects.
+     def self.each(obj_or_objs)
+       if obj_or_objs.respond_to?(:each)
+         obj_or_objs.each { |obj| yield obj }
+       else
+         yield obj_or_objs
+       end
+     end
+
+     # Formats the sentence (modifies the receiver) and returns its value.
+     # The length will be based on the sentence_limit parameter or the full
+     # length of the original sentence, whichever is less. The full sentence
+     # is returned if the sentence_limit is 0. The algorithm ensures that
+     # the search value is visible somewhere in the sentence.
+     def self.format_sentence_length(sentence, index, sentence_limit)
+       raise "A sentence value must be provided" if sentence.empty?
+       raise "The sentence length value must be even" if sentence_limit.odd?
+       if index < 0 or index > sentence.length
+         raise "Incorrect index value: #{index}"
+       end
+
+       return sentence if sentence_limit == 0
+
+       start = 0
+       finish = sentence.length
+
+       if sentence.length > sentence_limit
+         start = index - (sentence_limit / 2)
+         finish = index + (sentence_limit / 2)
+
+         if start < 0
+           diff = 0 - start
+           if (finish + diff) > sentence.length
+             finish = sentence.length
+           else
+             finish += diff
+           end
+           start = 0
+         elsif finish > sentence.length
+           diff = finish - sentence.length
+           if (start - diff) < 0
+             start = 0
+           else
+             start -= diff
+           end
+           finish = sentence.length
+         end
+
+         raise if sentence[start..(finish - 1)].length != sentence_limit
+       end
+
+       sentence.replace(sentence[start..(finish - 1)])
+     end
+
+     # Prints out the search results in a search engine page format.
+     # Most of the params are passed to Document#search; see the class docs.
+     # The stream param decides where the printf output is written to, and
+     # therefore must respond_to? :puts
+     # The format for each result is:
+     #
+     # Title
+     # Keywords (if there are some)
+     # Text Snippet (showing the searched for text if provided)
+     # Url
+     # <empty_line>
+     def self.printf_search_results(results, text = nil, case_sensitive = false,
+                                    sentence_length = 80, keyword_count = 5,
+                                    stream = Kernel)
+       raise "stream must respond_to? :puts" unless stream.respond_to? :puts
+       keyword_count -= 1 # Because Arrays are zero indexed.
+
+       results.each do |doc|
+         sentence = if text.nil?
+                      nil
+                    else
+                      sentence = doc.search(text, sentence_length).first
+                      if sentence.nil?
+                        nil
+                      else
+                        sentence.strip.empty? ? nil : sentence
+                      end
+                    end
+         stream.puts doc.title
+         unless doc.keywords.empty?
+           stream.puts doc.keywords[0..keyword_count].join(", ")
+         end
+         stream.puts sentence unless sentence.nil?
+         stream.puts doc.url
+         stream.puts
+       end
+       nil
+     end
+   end
+ end
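A worked example of the sentence windowing (editor's illustration): when the sentence exceeds the limit, a window of exactly sentence_limit characters is centred on the match index and shifted back inside the string at either edge.

    sentence = ("a" * 50) + "ruby" + ("b" * 50) # 104 chars, match at index 50.
    Wgit::Utils.format_sentence_length(sentence, 50, 80)
    sentence.length # => 80, with "ruby" kept near the middle of the window.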
data/lib/wgit/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Wgit
+   VERSION = "0.0.1".freeze
+ end
data/lib/wgit/web_crawler.rb ADDED
@@ -0,0 +1,134 @@
+ #!/usr/bin/env ruby
+
+ require_relative 'crawler'
+ require_relative 'database/database'
+
+ # @author Michael Telford
+ module Wgit
+
+   # Convenience method to crawl the World Wide Web.
+   # The default value (-1) for max_sites_to_crawl is unrestricted.
+   # The default max_data_size is 1GB.
+   def self.crawl_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
+     db = Wgit::Database.new
+     web_crawler = Wgit::WebCrawler.new(db, max_sites_to_crawl, max_data_size)
+     web_crawler.crawl_the_web
+   end
+
+   # Class which sets up a crawler and saves the indexed
+   # docs to a database. Will crawl the web forever if you let it :-)
+   class WebCrawler
+     attr_accessor :max_sites_to_crawl, :max_data_size
+     attr_reader :crawler, :db
+
+     def initialize(database,
+                    max_sites_to_crawl = -1,
+                    max_data_size = 1048576000)
+       @crawler = Wgit::Crawler.new
+       @db = database
+       @max_sites_to_crawl = max_sites_to_crawl
+       @max_data_size = max_data_size
+     end
+
+     # Retrieves urls from the database and recursively crawls each site,
+     # storing their internal pages into the database and adding their
+     # external urls to be crawled at a later date.
+     def crawl_the_web
+       if max_sites_to_crawl < 0
+         puts "Crawling until the database has been filled or it runs out of \
+ urls to crawl (which might be never)."
+       end
+       loop_count = 0
+
+       while keep_crawling?(loop_count) do
+         puts "Current database size: #{db.size}"
+         crawler.urls = db.uncrawled_urls
+
+         if crawler.urls.empty?
+           puts "No urls to crawl, exiting."
+           break
+         end
+         puts "Starting crawl loop for: #{crawler.urls}"
+
+         docs_count = 0
+         urls_count = 0
+
+         crawler.urls.each do |url|
+           unless keep_crawling?(loop_count)
+             puts "Reached max number of sites to crawl or database \
+ capacity, exiting."
+             return
+           end
+           loop_count += 1
+
+           url.crawled = true
+           raise unless db.update(url) == 1
+
+           site_docs_count = 0
+           ext_links = crawler.crawl_site(url) do |doc|
+             unless doc.empty?
+               if write_doc_to_db(doc)
+                 docs_count += 1
+                 site_docs_count += 1
+               end
+             end
+           end
+
+           urls_count += write_urls_to_db(ext_links)
+           puts "Crawled and saved #{site_docs_count} docs for the \
+ site: #{url}"
+         end
+
+         puts "Crawled and saved docs for #{docs_count} url(s) overall for \
+ this iteration."
+         puts "Found and saved #{urls_count} external url(s) for the next \
+ iteration."
+       end
+     end
+
+     private
+
+     # Keep crawling or not based on DB size and current loop iteration.
+     def keep_crawling?(loop_count)
+       return false if db.size >= max_data_size
+       # If max_sites_to_crawl is -1 for example then crawl away.
+       if max_sites_to_crawl < 0
+         true
+       else
+         loop_count < max_sites_to_crawl
+       end
+     end
+
+     # The unique url index on the documents collection prevents duplicate
+     # inserts.
+     def write_doc_to_db(doc)
+       db.insert(doc)
+       puts "Saved document for url: #{doc.url}"
+       true
+     rescue Mongo::Error::OperationFailure
+       puts "Document already exists: #{doc.url}"
+       false
+     end
+
+     # The unique url index on the urls collection prevents duplicate inserts.
+     def write_urls_to_db(urls)
+       count = 0
+       if urls.respond_to?(:each)
+         urls.each do |url|
+           begin
+             db.insert(url)
+             count += 1
+             puts "Inserted url: #{url}"
+           rescue Mongo::Error::OperationFailure
+             puts "Url already exists: #{url}"
+           end
+         end
+       end
+       count
+     end
+   end
+ end
+
+ if __FILE__ == $0
+   Wgit.crawl_the_web
+ end
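A kick-off sketch (editor's illustration; assumes valid connection details above and that the unique url indexes exist in the database):

    require 'wgit'

    # Seed the database, then crawl at most 10 sites or 500MB of data.
    db = Wgit::Database.new
    db.insert Wgit::Url.new("http://example.com")
    Wgit.crawl_the_web(10, 524288000)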
metadata ADDED
@@ -0,0 +1,62 @@
+ --- !ruby/object:Gem::Specification
+ name: wgit
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Michael Telford
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2016-03-07 00:00:00.000000000 Z
+ dependencies: []
+ description: Wgit is a WWW indexer/scraper which crawls URLs and retrieves their
+   page contents for later use. Also included in this package is a means to search
+   indexed documents stored in a database. Therefore this library provides the main
+   components of a WWW search engine. You can also use Wgit to copy entire websites'
+   HTML, making it far more powerful than wget. The Wgit API is easily extendable,
+   allowing you to pull out the parts of a webpage that are important to you, the
+   CSS or JS links for example.
+ email: michael.telford@live.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - "./lib/wgit.rb"
+ - "./lib/wgit/assertable.rb"
+ - "./lib/wgit/core_ext.rb"
+ - "./lib/wgit/crawler.rb"
+ - "./lib/wgit/database/database.rb"
+ - "./lib/wgit/database/model.rb"
+ - "./lib/wgit/database/mongo_connection_details.rb"
+ - "./lib/wgit/document.rb"
+ - "./lib/wgit/url.rb"
+ - "./lib/wgit/utils.rb"
+ - "./lib/wgit/version.rb"
+ - "./lib/wgit/web_crawler.rb"
+ homepage: http://rubygems.org/gems/wgit
+ licenses:
+ - MIT
+ metadata:
+   allowed_push_host: https://rubygems.org
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.4.5
+ signing_key:
+ specification_version: 4
+ summary: Wgit is wget on steroids with an easy to use API.
+ test_files: []