wgit 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 82f33e00a273c6cdeb3ba9c171110d849fff2428
4
+ data.tar.gz: 14c63f826d1d21811b14e9f3a2bca750b3f4afa3
5
+ SHA512:
6
+ metadata.gz: 7c42b925f72d9e7cceba79d9aee764f97b6537c0005038501a1f75c36b1bcd3b6036cfb9b62fcf01fd435e0348c1e8c00c445a291051c068fa58184de2c9590a
7
+ data.tar.gz: a2a756c3be7b9b214921bfdac5846a2250e452265285cb9c3b812d2eaefc2ab969b608cd1841f34507a6ef184f20ba7c98658daf0135fb85eead88de0356320f
data/lib/wgit.rb ADDED
@@ -0,0 +1,11 @@
1
+ require_relative 'wgit/version'
2
+ require_relative 'wgit/crawler'
3
+ require_relative 'wgit/web_crawler'
4
+ require_relative 'wgit/url'
5
+ require_relative 'wgit/document'
6
+ require_relative 'wgit/utils'
7
+ require_relative 'wgit/assertable'
8
+ require_relative 'wgit/database/database'
9
+ require_relative 'wgit/database/model'
10
+ require_relative 'wgit/database/mongo_connection_details'
11
+ #require_relative 'wgit/core_ext'
@@ -0,0 +1,69 @@
1
+
2
+ module Wgit
3
+
4
+ # @author Michael Telford
5
+ # Module containing assert methods including type checking which can be used
6
+ # for asserting the integrity of method definitions etc.
7
+ module Assertable
8
+ DEFAULT_TYPE_FAIL_MSG = "Expected: %s, Actual: %s"
9
+ WRONG_METHOD_MSG = "arr must be Enumerable, use a different method"
10
+ DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s"
11
+
12
+ # obj.instance_of? must return true for one of the types listed in
13
+ # type_or_types or an exception is thrown using msg if provided.
14
+ # type_or_types can be a single Class or an Enumerable of Class objects,
15
+ # Strings and Symbols will not work.
16
+ def assert_types(obj, type_or_types, msg = nil)
17
+ msg ||= DEFAULT_TYPE_FAIL_MSG % [type_or_types, obj.class]
18
+ if type_or_types.respond_to?(:any?)
19
+ match = type_or_types.any? { |type| obj.instance_of?(type) }
20
+ else
21
+ match = obj.instance_of?(type_or_types)
22
+ end
23
+ raise msg unless match
24
+ obj
25
+ end
26
+
27
+ # Each object within arr must match one of the types listed in
28
+ # type_or_types or an exception is thrown using msg if provided.
29
+ # type_or_types can be a single Class or an Enumerable of Class objects,
30
+ # Strings and Symbols will not work.
31
+ def assert_arr_types(arr, type_or_types, msg = nil)
32
+ raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
33
+ arr.each do |obj|
34
+ assert_types(obj, type_or_types, msg)
35
+ end
36
+ end
37
+
38
+ # The obj_or_objs must respond_to? all of the given methods or an
39
+ # Exception is raised using msg or a default message.
40
+ # Returns obj_or_objs on sucessful assertion.
41
+ def assert_respond_to(obj_or_objs, methods, msg = nil)
42
+ if obj_or_objs.respond_to?(:each)
43
+ obj_or_objs.each do |obj|
44
+ _assert_respond_to(obj, methods, msg)
45
+ end
46
+ else
47
+ _assert_respond_to(obj_or_objs, methods, msg)
48
+ end
49
+ obj_or_objs
50
+ end
51
+
52
+ private
53
+
54
+ def _assert_respond_to(obj, methods, msg = nil)
55
+ msg ||= DEFAULT_DUCK_FAIL_MSG % ["#{obj.class} (#{obj})", methods]
56
+ match = methods.all? { |method| obj.respond_to?(method) }
57
+ raise msg unless match
58
+ obj
59
+ end
60
+
61
+ alias :assert_type :assert_types
62
+ alias :type :assert_types
63
+ alias :types :assert_types
64
+ alias :assert_arr_type :assert_arr_types
65
+ alias :arr_type :assert_arr_types
66
+ alias :arr_types :assert_arr_types
67
+ alias :respond_to :assert_respond_to
68
+ end
69
+ end
@@ -0,0 +1,40 @@
1
+ require_relative 'url'
2
+
3
+ # @author Michael Telford
4
+ # Script which extends Ruby's core functionality when parsed.
5
+ # Needs to be required separately using `require 'wgit/core_ext'`.
6
+
7
+ class String
8
+ # Converts a String into a Wgit::Url object.
9
+ def to_url
10
+ Wgit::Url.new(self)
11
+ end
12
+ end
13
+
14
+ module Enumerable
15
+ # Converts each String instance into a Wgit::Url object and returns the new
16
+ # array.
17
+ def to_urls
18
+ map do |element|
19
+ process_url_element(element)
20
+ end
21
+ end
22
+
23
+ # Converts each String instance into a Wgit::Url object and returns the
24
+ # updated array.
25
+ def to_urls!
26
+ map! do |element|
27
+ process_url_element(element)
28
+ end
29
+ end
30
+ end
31
+
32
+ private
33
+
34
+ def process_url_element(element)
35
+ if element.is_a? String
36
+ element.to_url
37
+ else
38
+ element
39
+ end
40
+ end
@@ -0,0 +1,132 @@
1
+ require_relative 'url'
2
+ require_relative 'document'
3
+ require_relative 'utils'
4
+ require_relative 'assertable'
5
+ require 'net/http' # requires 'uri'
6
+
7
+ module Wgit
8
+
9
+ # @author Michael Telford
10
+ # Crawler class provides a means of crawling web URL's.
11
+ # Note that any redirects will not be followed for during crawling
12
+ # functionality.
13
+ class Crawler
14
+ include Assertable
15
+
16
+ attr_reader :urls, :docs
17
+
18
+ def initialize(*urls)
19
+ self.urls = urls unless urls.nil?
20
+ @docs = []
21
+ end
22
+
23
+ def urls=(urls)
24
+ @urls = []
25
+ Wgit::Utils.each(urls) { |url| add_url(url) }
26
+ end
27
+
28
+ def [](*urls)
29
+ self.urls = urls unless urls.nil?
30
+ end
31
+
32
+ def <<(url)
33
+ add_url(url)
34
+ end
35
+
36
+ # Crawls individual urls, not entire sites.
37
+ # Returns the last crawled doc.
38
+ # Yields each doc to the provided block or adds each doc to @docs
39
+ # which can be accessed by Crawler#docs after the method returns.
40
+ def crawl_urls(urls = @urls, &block)
41
+ raise "No urls to crawl" unless urls
42
+ @docs = []
43
+ doc = nil
44
+ Wgit::Utils.each(urls) { |url| doc = handle_crawl_block(url, &block) }
45
+ doc ? doc : @docs.last
46
+ end
47
+
48
+ # Crawl the url and return the response document or nil.
49
+ # Also yield(doc) if a block is provided. The doc is passed to the block
50
+ # regardless of the crawl success so the doc.url can be used if needed.
51
+ def crawl_url(url = @urls.first, &block)
52
+ assert_type(url, Url)
53
+ markup = fetch(url)
54
+ url.crawled = true
55
+ doc = Wgit::Document.new(url, markup)
56
+ block.call(doc) if block_given?
57
+ doc.empty? ? nil : doc
58
+ end
59
+
60
+ # Crawls an entire site by recursively going through its internal_links.
61
+ # Also yield(doc) for each crawled doc if a block is provided.
62
+ # A block is the only way to interact with the crawled docs.
63
+ # Returns a unique array of external urls collected from the site
64
+ # or nil if the base_url could not be crawled successfully.
65
+ def crawl_site(base_url = @urls.first, &block)
66
+ assert_type(base_url, Url)
67
+
68
+ doc = crawl_url(base_url, &block)
69
+ return nil if doc.nil?
70
+
71
+ crawled_urls = []
72
+ external_urls = doc.external_links
73
+ internal_urls = doc.internal_links
74
+
75
+ return doc.external_links.uniq if internal_urls.empty?
76
+
77
+ loop do
78
+ internal_urls.uniq! unless internal_urls.uniq.nil?
79
+
80
+ links = internal_urls - crawled_urls
81
+ break if links.empty?
82
+
83
+ links.each do |link|
84
+ doc = crawl_url(Wgit::Url.concat(base_url.to_base, link), &block)
85
+ crawled_urls << link
86
+ next if doc.nil?
87
+ internal_urls.concat(doc.internal_links)
88
+ external_urls.concat(doc.external_links)
89
+ end
90
+ end
91
+
92
+ external_urls.uniq
93
+ end
94
+
95
+ private
96
+
97
+ # Add the document to the @docs array for later processing
98
+ # or let the block process it here and now.
99
+ def handle_crawl_block(url, &block)
100
+ if not block_given?
101
+ @docs << crawl_url(url)
102
+ nil
103
+ else
104
+ crawl_url(url, &block)
105
+ end
106
+ end
107
+
108
+ # The fetch method performs a HTTP GET to obtain the HTML document.
109
+ # Invalid urls or any HTTP response that doesn't return a HTML body
110
+ # will be ignored and nil will be returned. This means that redirects
111
+ # etc. will not be followed.
112
+ def fetch(url)
113
+ raise unless url.respond_to?(:to_uri)
114
+ res = Net::HTTP.get_response(url.to_uri)
115
+ res.body.empty? ? nil : res.body
116
+ rescue
117
+ nil
118
+ end
119
+
120
+ def add_url(url)
121
+ @urls = [] if @urls.nil?
122
+ if url.instance_of?(Url)
123
+ @urls << url
124
+ else
125
+ @urls << Wgit::Url.new(url)
126
+ end
127
+ end
128
+
129
+ alias :crawl :crawl_urls
130
+ alias :crawl_r :crawl_site
131
+ end
132
+ end
@@ -0,0 +1,269 @@
1
+ require_relative '../document'
2
+ require_relative '../url'
3
+ require_relative '../utils'
4
+ require_relative '../assertable'
5
+ require_relative 'mongo_connection_details'
6
+ require_relative 'model'
7
+ require 'mongo'
8
+
9
+ module Wgit
10
+
11
+ # @author Michael Telford
12
+ # Class modeling a DB connection and CRUD operations for the Url and
13
+ # Document collections.
14
+ # The most common methods are: insert, update, urls, search, stats, size.
15
+ class Database
16
+ include Assertable
17
+
18
+ # Is relative to the root project folder, not this file.
19
+ LOG_FILE_PATH = "misc/mongo_log.txt"
20
+
21
+ def initialize
22
+ conn_details = Wgit::CONNECTION_DETAILS
23
+ if conn_details.empty?
24
+ raise "Wgit::CONNECTION_DETAILS must be defined and include :host,
25
+ :port, :db, :uname, :pword for a database connection to be established."
26
+ end
27
+
28
+ logger = Logger.new(LOG_FILE_PATH)
29
+ address = "#{conn_details[:host]}:#{conn_details[:port]}"
30
+ @@client = Mongo::Client.new([address],
31
+ :database => conn_details[:db],
32
+ :user => conn_details[:uname],
33
+ :password => conn_details[:pword],
34
+ :logger => logger,
35
+ :truncate_logs => false)
36
+ end
37
+
38
+ ### Create Data ###
39
+
40
+ def insert(data)
41
+ if data.is_a?(Url)
42
+ insert_urls(data)
43
+ elsif data.is_a?(Document)
44
+ insert_docs(data)
45
+ elsif data.respond_to?(:first)
46
+ if data.first.is_a?(Url)
47
+ insert_urls(data)
48
+ else
49
+ insert_docs(data)
50
+ end
51
+ else
52
+ raise "data is not in the correct format (all Url's or Document's)"
53
+ end
54
+ end
55
+
56
+ def insert_urls(url_or_urls)
57
+ unless url_or_urls.respond_to?(:map)
58
+ assert_type(url_or_urls, Url)
59
+ url_or_urls = Wgit::Model.url(url_or_urls)
60
+ else
61
+ assert_arr_types(url_or_urls, Url)
62
+ url_or_urls = url_or_urls.map do |url|
63
+ Wgit::Model.url(url)
64
+ end
65
+ end
66
+ create(:urls, url_or_urls)
67
+ end
68
+
69
+ def insert_docs(doc_or_docs)
70
+ unless doc_or_docs.respond_to?(:map)
71
+ assert_type(doc_or_docs, [Document, Hash])
72
+ unless doc_or_docs.is_a?(Hash)
73
+ doc_or_docs = Wgit::Model.document(doc_or_docs)
74
+ end
75
+ else
76
+ assert_arr_types(doc_or_docs, [Document, Hash])
77
+ doc_or_docs = doc_or_docs.map do |doc|
78
+ Wgit::Model.document(doc) unless doc.is_a?(Hash)
79
+ end
80
+ end
81
+ create(:documents, doc_or_docs)
82
+ end
83
+
84
+ ### Retrieve Data ###
85
+
86
+ # A crawled parameter value of nil (the default) returns all urls.
87
+ # A limit of 0 means all urls are returned.
88
+ # All urls are sorted by date_added ascending, in other words the first
89
+ # url in the results is the first added.
90
+ def urls(crawled = nil, limit = 0, skip = 0, &block)
91
+ crawled.nil? ? query = {} : query = { :crawled => crawled }
92
+
93
+ sort = { :date_added => 1 }
94
+ results = retrieve(:urls, query, sort, {}, limit, skip)
95
+ return [] if results.count < 1
96
+
97
+ # results.respond_to? :map! is false so we use map and overwrite the var.
98
+ results = results.map { |url_doc| Wgit::Url.new(url_doc) }
99
+ return results unless block_given?
100
+ results.each { |url| block.call(url) }
101
+ end
102
+
103
+ def crawled_urls(limit = 0, skip = 0, &block)
104
+ urls(true, limit, skip, &block)
105
+ end
106
+
107
+ def uncrawled_urls(limit = 0, skip = 0, &block)
108
+ urls(false, limit, skip, &block)
109
+ end
110
+
111
+ # Currently all searches are case insensitive.
112
+ #
113
+ # Searches against the indexed docs in the DB for the given text.
114
+ # The searched fields are decided by the text index setup against the
115
+ # documents collection. Currently we search against the following fields:
116
+ # "author", "keywords", "title" and "text".
117
+ #
118
+ # The MongoDB search ranks/sorts the results in order (highest first) based
119
+ # upon each documents textScore which records the number of text hits. We
120
+ # then store this textScore in each Document object for use elsewhere if
121
+ # needed.
122
+ #
123
+ # @param text [String] the value to search the data against.
124
+ # @param whole_sentence [Boolean] whether multiple words should be
125
+ # searched for separately.
126
+ # @param limit [Fixnum] the max length/count of the results array.
127
+ # @param skip [Fixnum] the number of results to skip, starting with the
128
+ # most relevant based upon the textScore of the search.
129
+ # @param block [Block] a block which if provided is passed to each result.
130
+ #
131
+ # @return [Array] of Document objects representing the search results.
132
+ def search(text, whole_sentence = false, limit = 10, skip = 0, &block)
133
+ text.strip!
134
+ text.replace("\"" + text + "\"") if whole_sentence
135
+
136
+ # The textScore sorts based on the most search hits.
137
+ # We use the textScore hash as a sort and a projection below.
138
+ # :$caseSensitive => case_sensitive, # 3.2+ only.
139
+ sort_proj = { :score => { :$meta => "textScore" } }
140
+ query = { :$text => { :$search => text } }
141
+ results = retrieve(:documents, query, sort_proj, sort_proj, limit, skip)
142
+
143
+ return [] if results.count < 1
144
+ # results.respond_to? :map! is false so we use map and overwrite the var.
145
+ results = results.map { |mongo_doc| Wgit::Document.new(mongo_doc) }
146
+ return results unless block_given?
147
+ results.each { |doc| block.call(doc) }
148
+ end
149
+
150
+ # Performs a search and pretty prints the results.
151
+ def search_p(text, whole_sentence = false, limit = 10,
152
+ skip = 0, sentence_length = 80, &block)
153
+ results = search(text, whole_sentence, limit, skip, &block)
154
+ Wgit::Utils.printf_search_results(results, text, false, sentence_length)
155
+ end
156
+
157
+ # Returns a Mongo object which can be used like a Hash to retrieve values.
158
+ def stats
159
+ @@client.command(:dbStats => 0).documents[0]
160
+ end
161
+
162
+ def size
163
+ stats[:dataSize]
164
+ end
165
+
166
+ ### Update Data ###
167
+
168
+ def update(data)
169
+ if data.is_a?(Url)
170
+ update_url(data)
171
+ elsif data.is_a?(Document)
172
+ update_doc(data)
173
+ else
174
+ raise "data is not in the correct format (all Url's or Document's)"
175
+ end
176
+ end
177
+
178
+ def update_url(url)
179
+ assert_type(url, Url)
180
+ selection = { :url => url }
181
+ url_hash = Wgit::Model.url(url).merge(Wgit::Model.common_update_data)
182
+ update = { "$set" => url_hash }
183
+ _update(true, :urls, selection, update)
184
+ end
185
+
186
+ def update_doc(doc)
187
+ assert_type(doc, Document)
188
+ selection = { :url => doc.url }
189
+ doc_hash = Wgit::Model.document(doc).merge(Wgit::Model.common_update_data)
190
+ update = { "$set" => doc_hash }
191
+ _update(true, :documents, selection, update)
192
+ end
193
+
194
+ private
195
+
196
+ def write_succeeded?(result, count = 1, multi = false)
197
+ case result.class.to_s
198
+ # Single create result.
199
+ when "Mongo::Operation::Write::Insert::Result"
200
+ result.documents.first[:err].nil?
201
+ # Multiple create result.
202
+ when "Mongo::BulkWrite::Result"
203
+ result.inserted_count == count
204
+ # Single and multiple update result.
205
+ when "Mongo::Operation::Write::Update::Result", # MongoDB 3.0
206
+ "Mongo::Operation::Write::Update::LegacyResult" # MongoDB 2.4
207
+ if multi
208
+ result.n == count
209
+ else
210
+ result.documents.first[:err].nil?
211
+ end
212
+ else
213
+ raise "Result class not currently supported: #{result.class.to_s}"
214
+ end
215
+ end
216
+
217
+ def create(collection, data)
218
+ assert_type(data, [Hash, Array])
219
+ # Single doc.
220
+ if data.is_a?(Hash)
221
+ data.merge!(Wgit::Model.common_insert_data)
222
+ result = @@client[collection.to_sym].insert_one(data)
223
+ unless write_succeeded?(result)
224
+ raise "DB write (insert) failed"
225
+ end
226
+ result.n
227
+ # Multiple docs.
228
+ elsif data.is_a?(Array)
229
+ assert_arr_types(data, Hash)
230
+ data.map! do |data_hash|
231
+ data_hash.merge(Wgit::Model.common_insert_data)
232
+ end
233
+ result = @@client[collection.to_sym].insert_many(data)
234
+ unless write_succeeded?(result, data.length)
235
+ raise "DB write(s) failed"
236
+ end
237
+ result.inserted_count
238
+ else
239
+ raise "data must be a Hash or an Array of Hash's"
240
+ end
241
+ end
242
+
243
+ def retrieve(collection, query, sort = {}, projection = {},
244
+ limit = 0, skip = 0)
245
+ assert_type(query, Hash)
246
+ @@client[collection.to_sym].find(query).projection(projection)
247
+ .skip(skip).limit(limit).sort(sort)
248
+ end
249
+
250
+ # NOTE: The Model.common_update_data should be merged in the calling
251
+ # method as the update param can be bespoke due to its nature.
252
+ def _update(single, collection, selection, update)
253
+ assert_arr_types([selection, update], Hash)
254
+ if single
255
+ result = @@client[collection.to_sym].update_one(selection, update)
256
+ else
257
+ result = @@client[collection.to_sym].update_many(selection, update)
258
+ end
259
+ raise "DB write (update) failed" unless write_succeeded?(result)
260
+ result.n
261
+ end
262
+
263
+ alias :count :size
264
+ alias :length :size
265
+ alias :insert_url :insert_urls
266
+ alias :insert_doc :insert_docs
267
+ alias :search_and_format :search_p
268
+ end
269
+ end
@@ -0,0 +1,31 @@
1
+ require_relative '../utils'
2
+
3
+ module Wgit
4
+
5
+ # @author Michael Telford
6
+ # Module containing the DB data model structure.
7
+ module Model
8
+ def self.url(url)
9
+ raise "url must respond to to_h" unless url.respond_to?(:to_h)
10
+ url.to_h
11
+ end
12
+
13
+ def self.document(doc)
14
+ raise "doc must respond to to_h" unless doc.respond_to?(:to_h)
15
+ doc.to_h(false)
16
+ end
17
+
18
+ def self.common_insert_data
19
+ {
20
+ :date_added => Wgit::Utils.time_stamp,
21
+ :date_modified => Wgit::Utils.time_stamp,
22
+ }
23
+ end
24
+
25
+ def self.common_update_data
26
+ {
27
+ :date_modified => Wgit::Utils.time_stamp,
28
+ }
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,27 @@
1
+
2
+ # @author Michael Telford
3
+ module Wgit
4
+ DB_PROVIDER = :MongoLabs.freeze
5
+
6
+ # OpenShift (MongoDB 2.4)
7
+ if DB_PROVIDER == :OpenShift
8
+ CONNECTION_DETAILS = {
9
+ :host => "127.0.0.1",
10
+ :port => "27017",
11
+ :db => "admin",
12
+ :uname => "admin",
13
+ :pword => "R5jUKv1fessb"
14
+ }.freeze
15
+ # MongoLabs (MongoDB 3.0)
16
+ elsif DB_PROVIDER == :MongoLabs
17
+ CONNECTION_DETAILS = {
18
+ :host => "ds037205.mongolab.com",
19
+ :port => "37205",
20
+ :db => "crawler",
21
+ :uname => "rubyapp",
22
+ :pword => "R5jUKv1fessb",
23
+ }.freeze
24
+ else
25
+ raise "Database provider '#{DB_PROVIDER}' is not recognized"
26
+ end
27
+ end
@@ -0,0 +1,293 @@
1
+ require_relative 'url'
2
+ require_relative 'utils'
3
+ require_relative 'assertable'
4
+ require 'nokogiri'
5
+
6
+ module Wgit
7
+
8
+ # @author Michael Telford
9
+ # Class modeling a HTML web document. Also doubles as a search result.
10
+ class Document
11
+ include Assertable
12
+
13
+ TEXT_ELEMENTS = [:dd, :div, :dl, :dt, :figcaption, :figure, :hr, :li,
14
+ :main, :ol, :p, :pre, :span, :ul, :h1, :h2, :h3, :h4, :h5]
15
+
16
+ attr_reader :url, :html, :title, :author, :keywords, :links, :text, :score
17
+
18
+ def initialize(url_or_doc, html = nil)
19
+ if (url_or_doc.is_a?(String))
20
+ assert_type(url_or_doc, Url)
21
+ html ||= ""
22
+
23
+ @url = url_or_doc
24
+ @html = html
25
+
26
+ @doc = Nokogiri::HTML(html) do |config|
27
+ # TODO: Remove #'s below when crawling in production.
28
+ #config.options = Nokogiri::XML::ParseOptions::STRICT |
29
+ # Nokogiri::XML::ParseOptions::NONET
30
+ end
31
+
32
+ init_title
33
+ init_author
34
+ init_keywords
35
+ init_links
36
+ init_text
37
+ @score = 0.0
38
+ else
39
+ # Init from a mongo collection document.
40
+ @url = Wgit::Url.new(url_or_doc[:url])
41
+ @html = url_or_doc[:html].nil? ? "" : url_or_doc[:html]
42
+ @title = url_or_doc[:title]
43
+ @author = url_or_doc[:author]
44
+ @keywords = url_or_doc[:keywords].nil? ? [] : url_or_doc[:keywords]
45
+ @links = url_or_doc[:links].nil? ? [] : url_or_doc[:links]
46
+ @links.map! { |link| Wgit::Url.new(link) }
47
+ @text = url_or_doc[:text].nil? ? [] : url_or_doc[:text]
48
+ @score = url_or_doc[:score].nil? ? 0.0 : url_or_doc[:score]
49
+ end
50
+ end
51
+
52
+ def internal_links
53
+ return [] if @links.empty?
54
+ @links.reject do |link|
55
+ begin
56
+ not link.relative_link?
57
+ rescue
58
+ true
59
+ end
60
+ end
61
+ end
62
+
63
+ def internal_full_links
64
+ return [] if internal_links.empty?
65
+ internal_links.map do |link|
66
+ link.replace("/" + link) unless link.start_with?("/")
67
+ Wgit::Url.new(@url.to_base + link)
68
+ end
69
+ end
70
+
71
+ def external_links
72
+ return [] if @links.empty?
73
+ @links.reject do |link|
74
+ begin
75
+ link.relative_link?
76
+ rescue
77
+ true
78
+ end
79
+ end
80
+ end
81
+
82
+ def stats
83
+ hash = {}
84
+ instance_variables.each do |var|
85
+ # Add up the total bytes of text as well as the length.
86
+ if var == :@text
87
+ count = 0
88
+ @text.each { |t| count += t.length }
89
+ hash[:text_length] = @text.length
90
+ hash[:text_bytes] = count
91
+ # Else take the #length method return value.
92
+ else
93
+ next unless instance_variable_get(var).respond_to?(:length)
94
+ hash[var[1..-1].to_sym] =
95
+ instance_variable_get(var).send(:length)
96
+ end
97
+ end
98
+ hash
99
+ end
100
+
101
+ def size
102
+ stats[:html]
103
+ end
104
+
105
+ def to_h(include_html = false)
106
+ ignore = include_html ? [] : [:@html]
107
+ ignore << :@doc # Always ignore :@doc
108
+ Wgit::Utils.to_h(self, ignore)
109
+ end
110
+
111
+ # Override of the default == method, is equal if url and html both match.
112
+ # Use doc.object_id == other_doc.object_id for exact object comparison.
113
+ def ==(other_doc)
114
+ return false unless other_doc.is_a? Wgit::Document
115
+ url == other_doc.url and html == other_doc.html
116
+ end
117
+
118
+ # Shortcut for calling Document#html[range].
119
+ def [](range)
120
+ html[range]
121
+ end
122
+
123
+ def empty?
124
+ html.strip.empty?
125
+ end
126
+
127
+ # Searches against the Document#text for the given search text.
128
+ # The number of search hits for each sentenence are recorded internally
129
+ # and used to rank/sort the search results before being returned. Where
130
+ # the Database#search method search all documents for the most hits this
131
+ # method searches each documents text for the most hits.
132
+ #
133
+ # Each search result comprises of a sentence of a given length. The length
134
+ # will be based on the sentence_limit parameter or the full length of the
135
+ # original sentence, which ever is less. The algorithm obviously ensures
136
+ # that the search value is visible somewhere in the sentence.
137
+ #
138
+ # @param text [String] the value to search the document text against.
139
+ # @param sentence_limit [Fixnum] the length of each search result
140
+ # sentence.
141
+ #
142
+ # @return [Array] of String objects representing the search results.
143
+ def search(text, sentence_limit = 80)
144
+ raise "A search value must be provided" if text.empty?
145
+ raise "The sentence length value must be even" if sentence_limit.odd?
146
+
147
+ results = {}
148
+ regex = Regexp.new(text, Regexp::IGNORECASE)
149
+
150
+ @text.each do |sentence|
151
+ hits = sentence.scan(regex).count
152
+ if hits > 0
153
+ sentence.strip!
154
+ index = sentence.index(regex)
155
+ Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
156
+ results[sentence] = hits
157
+ end
158
+ end
159
+
160
+ return [] if results.empty?
161
+ results = Hash[results.sort_by { |k, v| v }]
162
+ results.keys.reverse
163
+ end
164
+
165
+ # Performs a text search (see search for details) but assigns the results
166
+ # to the @text instance variable. This can be used for sub search
167
+ # functionality. Note that there is no way of getting the original text
168
+ # back however.
169
+ def search!(text)
170
+ @text = search(text)
171
+ end
172
+
173
+ # Uses Nokogiri's xpath method to search the doc's html and return the
174
+ # results.
175
+ def xpath(xpath)
176
+ @doc.xpath(xpath)
177
+ end
178
+
179
+ private
180
+
181
+ def process_str(str)
182
+ str.encode!('UTF-8', 'UTF-8', :invalid => :replace)
183
+ str.strip!
184
+ str # This is required to return the str, do not remove.
185
+ end
186
+
187
+ def process_arr(array)
188
+ assert_arr_types(array, String)
189
+ array.map! { |str| process_str(str) }
190
+ array.reject! { |str| str.empty? }
191
+ array.uniq!
192
+ end
193
+
194
+ # Modifies internal links by removing this doc's base or host url if
195
+ # present. http://www.google.co.uk/about.html (with or without the
196
+ # protocol prefix) will become about.html meaning it'll appear within
197
+ # internal_links.
198
+ def process_internal_links(links)
199
+ links.map! do |link|
200
+ host_or_base = if link.start_with?("http")
201
+ url.base
202
+ else
203
+ url.host
204
+ end
205
+ if link.start_with?(host_or_base)
206
+ link.sub!(host_or_base, "")
207
+ link.replace(link[1..-1]) if link.start_with?("/")
208
+ link.strip!
209
+ end
210
+ link
211
+ end
212
+ end
213
+
214
+ def text_elements_xpath
215
+ xpath = ""
216
+ return xpath if TEXT_ELEMENTS.empty?
217
+ el_xpath = "//%s/text()"
218
+ TEXT_ELEMENTS.each_with_index do |el, i|
219
+ xpath += " | " unless i == 0
220
+ xpath += el_xpath % [el]
221
+ end
222
+ xpath
223
+ end
224
+
225
+ def init_var(xpath, var, first_result = true)
226
+ results = @doc.xpath(xpath)
227
+ unless results.nil? || results.empty?
228
+ result = if first_result
229
+ results.first.content
230
+ else
231
+ results.map { |res| res.content }
232
+ end
233
+ instance_variable_set(var, result)
234
+ end
235
+ end
236
+
237
+ def init_title
238
+ @title = nil
239
+ xpath = "//title"
240
+ init_var(xpath, :@title)
241
+ process_str(@title) unless @title.nil?
242
+ end
243
+
244
+ def init_author
245
+ @author = nil
246
+ xpath = "//meta[@name='author']/@content"
247
+ init_var(xpath, :@author)
248
+ process_str(@author) unless @author.nil?
249
+ end
250
+
251
+ def init_keywords
252
+ @keywords = nil
253
+ xpath = "//meta[@name='keywords']/@content"
254
+ init_var(xpath, :@keywords)
255
+ return @keywords = [] unless @keywords
256
+ @keywords = @keywords.split(",")
257
+ process_arr(@keywords)
258
+ end
259
+
260
+ def init_links
261
+ @links = nil
262
+ xpath = "//a/@href"
263
+ init_var(xpath, :@links, false)
264
+ return @links = [] unless @links
265
+ process_arr(@links)
266
+ @links.reject! { |link| link == "/" }
267
+ @links.map! do |link|
268
+ begin
269
+ Wgit::Url.new(link)
270
+ rescue
271
+ nil
272
+ end
273
+ end
274
+ @links.reject! { |link| link.nil? }
275
+ process_internal_links(@links)
276
+ end
277
+
278
+ def init_text
279
+ @text = nil
280
+ xpath = text_elements_xpath
281
+ init_var(xpath, :@text, false)
282
+ return @text = [] unless @text
283
+ process_arr(@text)
284
+ end
285
+
286
+ alias :to_hash :to_h
287
+ alias :relative_links :internal_links
288
+ alias :relative_urls :internal_links
289
+ alias :relative_full_links :internal_full_links
290
+ alias :relative_full_urls :internal_full_links
291
+ alias :external_urls :external_links
292
+ end
293
+ end
data/lib/wgit/url.rb ADDED
@@ -0,0 +1,140 @@
1
+ require_relative 'utils'
2
+ require 'uri'
3
+
4
+ module Wgit
5
+
6
+ # @author Michael Telford
7
+ # Class modeling a web based URL.
8
+ # Can be an internal link e.g. "about.html"
9
+ # or a full URL e.g. "http://www.google.co.uk".
10
+ class Url < String
11
+ attr_accessor :crawled, :date_crawled
12
+
13
+ def initialize(url_or_doc, crawled = false, date_crawled = nil)
14
+ if (url_or_doc.is_a?(String))
15
+ url = url_or_doc
16
+ else
17
+ # Init from a mongo collection document.
18
+ url = url_or_doc[:url]
19
+ crawled = url_or_doc[:crawled].nil? ? false : url_or_doc[:crawled]
20
+ date_crawled = url_or_doc[:date_crawled]
21
+ end
22
+ @uri = URI(url)
23
+ @crawled = crawled
24
+ @date_crawled = date_crawled
25
+ super(url)
26
+ end
27
+
28
+ def self.validate(url)
29
+ if Wgit::Url.relative_link?(url)
30
+ raise "Invalid url (or a relative link): #{url}"
31
+ end
32
+ unless url.start_with?("http://") or url.start_with?("https://")
33
+ raise "Invalid url (missing protocol prefix): #{url}"
34
+ end
35
+ if URI.regexp.match(url).nil?
36
+ raise "Invalid url: #{url}"
37
+ end
38
+ end
39
+
40
+ def self.valid?(url)
41
+ Wgit::Url.validate(url)
42
+ true
43
+ rescue
44
+ false
45
+ end
46
+
47
+ # Modifies the receiver url by prefixing it with a protocol.
48
+ # Returns the url whether its been modified or not.
49
+ def self.prefix_protocol(url, https = false)
50
+ unless url.start_with?("http://") or url.start_with?("https://")
51
+ if https
52
+ url.replace("https://#{url}")
53
+ else
54
+ url.replace("http://#{url}")
55
+ end
56
+ end
57
+ url
58
+ end
59
+
60
+ # URI.split("http://www.google.co.uk/about.html") returns the following:
61
+ # array[2]: "www.google.co.uk", array[5]: "/about.html".
62
+ # This means that all external links in a page are expected to have a
63
+ # protocol prefix e.g. "http://", otherwise the link is treated as an
64
+ # internal link (regardless of whether it is valid or not).
65
+ def self.relative_link?(link)
66
+ link_segs = URI.split(link)
67
+ if not link_segs[2].nil? and not link_segs[2].empty?
68
+ false
69
+ elsif not link_segs[5].nil? and not link_segs[5].empty?
70
+ true
71
+ else
72
+ raise "Invalid link: #{link}"
73
+ end
74
+ end
75
+
76
+ def self.concat(host, link)
77
+ url = host
78
+ url.chop! if url.end_with?("/")
79
+ link = link[1..-1] if link.start_with?("/")
80
+ Wgit::Url.new(url + "/" + link)
81
+ end
82
+
83
+ def relative_link?
84
+ Wgit::Url.relative_link?(self)
85
+ end
86
+
87
+ def valid?
88
+ Wgit::Url.valid?(self)
89
+ end
90
+
91
+ def concat(link)
92
+ Wgit::Url.concat(self, link)
93
+ end
94
+
95
+ def crawled=(bool)
96
+ @crawled = bool
97
+ @date_crawled = bool ? Wgit::Utils.time_stamp : nil
98
+ end
99
+
100
+ def to_uri
101
+ @uri
102
+ end
103
+
104
+ def to_url
105
+ self
106
+ end
107
+
108
+ # Given http://www.google.co.uk/about.html, www.google.co.uk is returned.
109
+ def to_host
110
+ Wgit::Url.new(@uri.host)
111
+ end
112
+
113
+ # URI.split("http://www.google.co.uk/about.html") returns the following:
114
+ # array[0]: "http://", array[2]: "www.google.co.uk".
115
+ # Returns array[0] + array[2] e.g. http://www.google.co.uk.
116
+ def to_base
117
+ if Wgit::Url.relative_link?(self)
118
+ raise "A relative link doesn't have a base URL: #{self}"
119
+ end
120
+ url_segs = URI.split(self)
121
+ if url_segs[0].nil? or url_segs[2].nil? or url_segs[2].empty?
122
+ raise "Both a protocol and host are needed: #{self}"
123
+ end
124
+ base = "#{url_segs[0]}://#{url_segs[2]}"
125
+ Wgit::Url.new(base)
126
+ end
127
+
128
+ def to_h
129
+ ignore = [:@uri]
130
+ h = Wgit::Utils.to_h(self, ignore)
131
+ Hash[h.to_a.insert(0, [:url, self])] # Insert url at position 0.
132
+ end
133
+
134
+ alias :to_hash :to_h
135
+ alias :host :to_host
136
+ alias :base :to_base
137
+ alias :internal_link? :relative_link?
138
+ alias :crawled? :crawled
139
+ end
140
+ end
data/lib/wgit/utils.rb ADDED
@@ -0,0 +1,115 @@
1
+
2
module Wgit

    # @author Michael Telford
    # Utility module containing generic methods.
    module Utils
        # Returns the current time (used e.g. to date stamp crawled URLs).
        # Idiom fix: Time.now instead of Time.new (identical behaviour).
        def self.time_stamp
            Time.now
        end

        # Returns a hash created from obj's instance vars and values.
        # Vars listed in ignore (as symbols, e.g. [:@uri]) are excluded.
        def self.to_h(obj, ignore = [])
            hash = {}
            obj.instance_variables.each do |var|
                next if ignore.include?(var)
                # Strip the leading '@' to produce the symbol key.
                hash[var[1..-1].to_sym] = obj.instance_variable_get(var)
            end
            hash
        end

        # Improved each method which takes care of singleton and enumerable
        # objects. Yields one or more objects.
        def self.each(obj_or_objs)
            if obj_or_objs.respond_to?(:each)
                obj_or_objs.each { |obj| yield obj }
            else
                yield obj_or_objs
            end
        end

        # Formats the sentence (modifies the receiver) and returns its value.
        # The length will be based on the sentence_limit parameter or the full
        # length of the original sentence, whichever is less. The full
        # sentence is returned if the sentence_limit is 0. The algorithm
        # ensures that the search value (located at index) remains visible
        # somewhere in the resulting sentence.
        def self.format_sentence_length(sentence, index, sentence_limit)
            raise "A sentence value must be provided" if sentence.empty?
            raise "The sentence length value must be even" if sentence_limit.odd?
            if index < 0 || index > sentence.length
                raise "Incorrect index value: #{index}"
            end

            return sentence if sentence_limit == 0

            start = 0
            finish = sentence.length

            if sentence.length > sentence_limit
                # Centre a window of sentence_limit chars on index...
                start = index - (sentence_limit / 2)
                finish = index + (sentence_limit / 2)

                if start < 0
                    # ...then shift it right so it starts at 0.
                    diff = 0 - start
                    if (finish + diff) > sentence.length
                        finish = sentence.length
                    else
                        finish += diff
                    end
                    start = 0
                elsif finish > sentence.length
                    # ...or shift it left so it ends at the sentence end.
                    diff = finish - sentence.length
                    if (start - diff) < 0
                        start = 0
                    else
                        start -= diff
                    end
                    finish = sentence.length
                end

                # Sanity check: the clamped window must match the limit.
                if sentence[start..(finish - 1)].length != sentence_limit
                    raise "Formatted sentence has the wrong length"
                end
            end

            sentence.replace(sentence[start..(finish - 1)])
        end

        # Prints out the search results in a search engine page format.
        # Most of the params are passed to Document#search - see class docs.
        # The stream param decides where the output is written to, and
        # therefore must respond_to? :puts (defaults to Kernel).
        # The format for each result is:
        #
        # Title
        # Keywords (if there are some)
        # Text Snippet (showing the searched for text if provided)
        # Url
        # <empty_line>
        def self.printf_search_results(results, text = nil, case_sensitive = false,
                                       sentence_length = 80, keyword_count = 5,
                                       stream = Kernel)
            raise "stream must respond_to? :puts" unless stream.respond_to? :puts
            keyword_count -= 1 # Because Array's are zero indexed.

            results.each do |doc|
                sentence = if text.nil?
                    nil
                else
                    sentence = doc.search(text, sentence_length).first
                    if sentence.nil?
                        nil
                    else
                        sentence.strip.empty? ? nil : sentence
                    end
                end
                stream.puts doc.title
                unless doc.keywords.empty?
                    stream.puts doc.keywords[0..keyword_count].join(", ")
                end
                stream.puts sentence unless sentence.nil?
                stream.puts doc.url
                stream.puts
            end
            nil
        end
    end
end
@@ -0,0 +1,3 @@
1
+ module Wgit
2
+ VERSION = "0.0.1".freeze
3
+ end
@@ -0,0 +1,134 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative 'crawler'
4
+ require_relative 'database/database'
5
+
6
+ # @author Michael Telford
7
module Wgit

    # Convenience method to crawl the World Wide Web.
    # The default value (-1) for max_sites_to_crawl is unrestricted.
    # The default max_data_size is 1GB.
    def self.crawl_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
        db = Wgit::Database.new
        web_crawler = Wgit::WebCrawler.new(db, max_sites_to_crawl, max_data_size)
        web_crawler.crawl_the_web
    end

    # Class which sets up a crawler and saves the indexed
    # docs to a database. Will crawl the web forever if you let it :-)
    class WebCrawler
        attr_accessor :max_sites_to_crawl, :max_data_size
        attr_reader :crawler, :db

        # database - the persistence layer for crawled docs and urls.
        # max_sites_to_crawl - stop after this many sites (-1 = unrestricted).
        # max_data_size - stop once db.size reaches this value.
        def initialize(database,
                       max_sites_to_crawl = -1,
                       max_data_size = 1048576000)
            @crawler = Wgit::Crawler.new
            @db = database
            @max_sites_to_crawl = max_sites_to_crawl
            @max_data_size = max_data_size
        end

        # Retrieves url's from the database and recursively crawls each site
        # storing their internal pages into the database and adding their
        # external url's to be crawled at a later date.
        #
        # Fix: the original built multi-line messages with backslash
        # continuations inside double-quoted strings, which embedded the
        # source indentation into the printed output; adjacent string
        # literals are used instead.
        def crawl_the_web
            if max_sites_to_crawl < 0
                puts "Crawling until the database has been filled or it " \
                     "runs out of urls to crawl (which might be never)."
            end
            loop_count = 0

            while keep_crawling?(loop_count)
                puts "Current database size: #{db.size}"
                crawler.urls = db.uncrawled_urls

                if crawler.urls.empty?
                    puts "No urls to crawl, exiting."
                    break
                end
                puts "Starting crawl loop for: #{crawler.urls}"

                docs_count = 0
                urls_count = 0

                crawler.urls.each do |url|
                    unless keep_crawling?(loop_count)
                        puts "Reached max number of sites to crawl or " \
                             "database capacity, exiting."
                        return
                    end
                    loop_count += 1

                    url.crawled = true
                    raise "Failed to update url: #{url}" unless db.update(url) == 1

                    site_docs_count = 0
                    ext_links = crawler.crawl_site(url) do |doc|
                        unless doc.empty?
                            if write_doc_to_db(doc)
                                docs_count += 1
                                site_docs_count += 1
                            end
                        end
                    end

                    urls_count += write_urls_to_db(ext_links)
                    puts "Crawled and saved #{site_docs_count} docs for the " \
                         "site: #{url}"
                end

                puts "Crawled and saved docs for #{docs_count} url(s) " \
                     "overall for this iteration."
                puts "Found and saved #{urls_count} external url(s) for the " \
                     "next iteration."
            end
        end

        private

        # Keep crawling or not based on DB size and current loop iteration.
        def keep_crawling?(loop_count)
            return false if db.size >= max_data_size
            # If max_sites_to_crawl is -1 for example then crawl away.
            if max_sites_to_crawl < 0
                true
            else
                loop_count < max_sites_to_crawl
            end
        end

        # Inserts the doc and returns true on success, false on duplicate.
        # The unique url index on the documents collection prevents
        # duplicate inserts (raising an OperationFailure).
        def write_doc_to_db(doc)
            db.insert(doc)
            puts "Saved document for url: #{doc.url}"
            true
        rescue Mongo::Error::OperationFailure
            puts "Document already exists: #{doc.url}"
            false
        end

        # Inserts each url and returns the number actually saved.
        # The unique url index on the urls collection prevents duplicate
        # inserts (raising an OperationFailure, which is skipped over).
        def write_urls_to_db(urls)
            count = 0
            if urls.respond_to?(:each)
                urls.each do |url|
                    begin
                        db.insert(url)
                        count += 1
                        puts "Inserted url: #{url}"
                    rescue Mongo::Error::OperationFailure
                        puts "Url already exists: #{url}"
                    end
                end
            end
            count
        end
    end
end
131
+
132
# Kick off a full web crawl when this file is executed directly as a script.
Wgit.crawl_the_web if __FILE__ == $0
metadata ADDED
@@ -0,0 +1,62 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: wgit
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Michael Telford
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-03-07 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Wgit is a WWW indexer/scraper which crawls URLs and retrieves their
14
+ page contents for later use. Also included in this package is a means to search
15
+ indexed documents stored in a database. Therefore this library provides the main
16
+ components of a WWW search engine. You can also use Wgit to copy entire websites'
17
+ HTML making it far more powerful than wget. The Wgit API is easily extendable allowing
18
+ you to easily pull out the parts of a webpage that are important to you, the CSS
19
+ or JS links for example.
20
+ email: michael.telford@live.com
21
+ executables: []
22
+ extensions: []
23
+ extra_rdoc_files: []
24
+ files:
25
+ - "./lib/wgit.rb"
26
+ - "./lib/wgit/assertable.rb"
27
+ - "./lib/wgit/core_ext.rb"
28
+ - "./lib/wgit/crawler.rb"
29
+ - "./lib/wgit/database/database.rb"
30
+ - "./lib/wgit/database/model.rb"
31
+ - "./lib/wgit/database/mongo_connection_details.rb"
32
+ - "./lib/wgit/document.rb"
33
+ - "./lib/wgit/url.rb"
34
+ - "./lib/wgit/utils.rb"
35
+ - "./lib/wgit/version.rb"
36
+ - "./lib/wgit/web_crawler.rb"
37
+ homepage: http://rubygems.org/gems/wgit
38
+ licenses:
39
+ - MIT
40
+ metadata:
41
+ allowed_push_host: https://rubygems.org
42
+ post_install_message:
43
+ rdoc_options: []
44
+ require_paths:
45
+ - lib
46
+ required_ruby_version: !ruby/object:Gem::Requirement
47
+ requirements:
48
+ - - ">="
49
+ - !ruby/object:Gem::Version
50
+ version: '0'
51
+ required_rubygems_version: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ requirements: []
57
+ rubyforge_project:
58
+ rubygems_version: 2.4.5
59
+ signing_key:
60
+ specification_version: 4
61
+ summary: Wgit is wget on steroids with an easy to use API.
62
+ test_files: []