wgit 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/wgit.rb +11 -0
- data/lib/wgit/assertable.rb +69 -0
- data/lib/wgit/core_ext.rb +40 -0
- data/lib/wgit/crawler.rb +132 -0
- data/lib/wgit/database/database.rb +269 -0
- data/lib/wgit/database/model.rb +31 -0
- data/lib/wgit/database/mongo_connection_details.rb +27 -0
- data/lib/wgit/document.rb +293 -0
- data/lib/wgit/url.rb +140 -0
- data/lib/wgit/utils.rb +115 -0
- data/lib/wgit/version.rb +3 -0
- data/lib/wgit/web_crawler.rb +134 -0
- metadata +62 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 82f33e00a273c6cdeb3ba9c171110d849fff2428
|
4
|
+
data.tar.gz: 14c63f826d1d21811b14e9f3a2bca750b3f4afa3
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 7c42b925f72d9e7cceba79d9aee764f97b6537c0005038501a1f75c36b1bcd3b6036cfb9b62fcf01fd435e0348c1e8c00c445a291051c068fa58184de2c9590a
|
7
|
+
data.tar.gz: a2a756c3be7b9b214921bfdac5846a2250e452265285cb9c3b812d2eaefc2ab969b608cd1841f34507a6ef184f20ba7c98658daf0135fb85eead88de0356320f
|
data/lib/wgit.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
require_relative 'wgit/version'
|
2
|
+
require_relative 'wgit/crawler'
|
3
|
+
require_relative 'wgit/web_crawler'
|
4
|
+
require_relative 'wgit/url'
|
5
|
+
require_relative 'wgit/document'
|
6
|
+
require_relative 'wgit/utils'
|
7
|
+
require_relative 'wgit/assertable'
|
8
|
+
require_relative 'wgit/database/database'
|
9
|
+
require_relative 'wgit/database/model'
|
10
|
+
require_relative 'wgit/database/mongo_connection_details'
|
11
|
+
#require_relative 'wgit/core_ext'
|
@@ -0,0 +1,69 @@
|
|
1
|
+
|
2
|
+
module Wgit

  # @author Michael Telford
  # Module containing assert methods including type checking which can be used
  # for asserting the integrity of method definitions etc.
  module Assertable
    DEFAULT_TYPE_FAIL_MSG = "Expected: %s, Actual: %s".freeze
    WRONG_METHOD_MSG = "arr must be Enumerable, use a different method".freeze
    DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s".freeze

    # obj.instance_of? must return true for one of the types listed in
    # type_or_types or an exception is thrown using msg if provided.
    # type_or_types can be a single Class or an Enumerable of Class objects,
    # Strings and Symbols will not work.
    # Returns obj on successful assertion.
    def assert_types(obj, type_or_types, msg = nil)
      msg ||= DEFAULT_TYPE_FAIL_MSG % [type_or_types, obj.class]
      match = if type_or_types.respond_to?(:any?)
                type_or_types.any? { |type| obj.instance_of?(type) }
              else
                obj.instance_of?(type_or_types)
              end
      raise msg unless match
      obj
    end

    # Each object within arr must match one of the types listed in
    # type_or_types or an exception is thrown using msg if provided.
    # type_or_types can be a single Class or an Enumerable of Class objects,
    # Strings and Symbols will not work.
    # Returns arr on successful assertion.
    def assert_arr_types(arr, type_or_types, msg = nil)
      raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
      arr.each do |obj|
        assert_types(obj, type_or_types, msg)
      end
    end

    # The obj_or_objs must respond_to? all of the given methods or an
    # Exception is raised using msg or a default message.
    # methods may be a single Symbol or an Enumerable of Symbols.
    # Returns obj_or_objs on sucessful assertion.
    def assert_respond_to(obj_or_objs, methods, msg = nil)
      if obj_or_objs.respond_to?(:each)
        obj_or_objs.each do |obj|
          _assert_respond_to(obj, methods, msg)
        end
      else
        _assert_respond_to(obj_or_objs, methods, msg)
      end
      obj_or_objs
    end

    private

    # Asserts a single object responds to every given method.
    # Previously a bare Symbol for methods raised NoMethodError (Symbol has
    # no #all?); wrapping generalizes the API without breaking callers that
    # already pass an Enumerable.
    def _assert_respond_to(obj, methods, msg = nil)
      methods = [methods] unless methods.respond_to?(:all?)
      msg ||= DEFAULT_DUCK_FAIL_MSG % ["#{obj.class} (#{obj})", methods]
      match = methods.all? { |method| obj.respond_to?(method) }
      raise msg unless match
      obj
    end

    alias :assert_type :assert_types
    alias :type :assert_types
    alias :types :assert_types
    alias :assert_arr_type :assert_arr_types
    alias :arr_type :assert_arr_types
    alias :arr_types :assert_arr_types
    alias :respond_to :assert_respond_to
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require_relative 'url'

# @author Michael Telford
# Script which extends Ruby's core functionality when parsed.
# Needs to be required separately using `require 'wgit/core_ext'`.
# NOTE(review): this deliberately monkey-patches core classes (String,
# Enumerable); it is opt-in and not loaded by `require 'wgit'`.

class String
    # Converts a String into a Wgit::Url object.
    def to_url
        Wgit::Url.new(self)
    end
end

module Enumerable
    # Converts each String instance into a Wgit::Url object and returns the new
    # array. Non-String elements are passed through unchanged.
    def to_urls
        map do |element|
            process_url_element(element)
        end
    end

    # Converts each String instance into a Wgit::Url object and returns the
    # updated array (mutates the receiver).
    # NOTE(review): #map! is defined by Array, not Enumerable — calling
    # to_urls! on a non-Array enumerable will raise NoMethodError; confirm
    # this is only intended for Arrays.
    def to_urls!
        map! do |element|
            process_url_element(element)
        end
    end
end

private

# Maps a single element: Strings become Wgit::Url, anything else is
# returned untouched. Defined at the top level (private on Object) so the
# Enumerable methods above can call it.
def process_url_element(element)
    if element.is_a? String
        element.to_url
    else
        element
    end
end
|
data/lib/wgit/crawler.rb
ADDED
@@ -0,0 +1,132 @@
|
|
1
|
+
require_relative 'url'
|
2
|
+
require_relative 'document'
|
3
|
+
require_relative 'utils'
|
4
|
+
require_relative 'assertable'
|
5
|
+
require 'net/http' # requires 'uri'
|
6
|
+
|
7
|
+
module Wgit

    # @author Michael Telford
    # Crawler class provides a means of crawling web URL's.
    # Note that any redirects will not be followed for during crawling
    # functionality.
    class Crawler
        include Assertable

        # urls: the Wgit::Url's to crawl. docs: the Wgit::Document's produced
        # by the last crawl_urls call (only populated when no block is given).
        attr_reader :urls, :docs

        # Accepts zero or more urls (splatted, so `urls` is always an Array
        # here — the nil guard is defensive only).
        def initialize(*urls)
            self.urls = urls unless urls.nil?
            @docs = []
        end

        # Replaces @urls, converting each element to a Wgit::Url via add_url.
        def urls=(urls)
            @urls = []
            Wgit::Utils.each(urls) { |url| add_url(url) }
        end

        # Alternative setter syntax: crawler[url1, url2].
        def [](*urls)
            self.urls = urls unless urls.nil?
        end

        # Appends a single url to @urls.
        def <<(url)
            add_url(url)
        end

        # Crawls individual urls, not entire sites.
        # Returns the last crawled doc.
        # Yields each doc to the provided block or adds each doc to @docs
        # which can be accessed by Crawler#docs after the method returns.
        def crawl_urls(urls = @urls, &block)
            raise "No urls to crawl" unless urls
            @docs = []
            doc = nil
            Wgit::Utils.each(urls) { |url| doc = handle_crawl_block(url, &block) }
            # doc is nil when no block was given (docs went to @docs instead).
            doc ? doc : @docs.last
        end

        # Crawl the url and return the response document or nil.
        # Also yield(doc) if a block is provided. The doc is passed to the block
        # regardless of the crawl success so the doc.url can be used if needed.
        def crawl_url(url = @urls.first, &block)
            assert_type(url, Url)
            markup = fetch(url)
            # Marked crawled even when fetch failed (markup nil).
            url.crawled = true
            doc = Wgit::Document.new(url, markup)
            block.call(doc) if block_given?
            doc.empty? ? nil : doc
        end

        # Crawls an entire site by recursively going through its internal_links.
        # Also yield(doc) for each crawled doc if a block is provided.
        # A block is the only way to interact with the crawled docs.
        # Returns a unique array of external urls collected from the site
        # or nil if the base_url could not be crawled successfully.
        def crawl_site(base_url = @urls.first, &block)
            assert_type(base_url, Url)

            doc = crawl_url(base_url, &block)
            return nil if doc.nil?

            crawled_urls = []
            external_urls = doc.external_links
            internal_urls = doc.internal_links

            return doc.external_links.uniq if internal_urls.empty?

            # Breadth-style loop: crawl every not-yet-crawled internal link,
            # accumulating newly discovered links until none remain.
            loop do
                # NOTE(review): the `unless internal_urls.uniq.nil?` guard is
                # redundant — Array#uniq never returns nil — so uniq! always runs.
                internal_urls.uniq! unless internal_urls.uniq.nil?

                links = internal_urls - crawled_urls
                break if links.empty?

                links.each do |link|
                    doc = crawl_url(Wgit::Url.concat(base_url.to_base, link), &block)
                    crawled_urls << link
                    next if doc.nil?
                    internal_urls.concat(doc.internal_links)
                    external_urls.concat(doc.external_links)
                end
            end

            external_urls.uniq
        end

        private

        # Add the document to the @docs array for later processing
        # or let the block process it here and now.
        def handle_crawl_block(url, &block)
            if not block_given?
                @docs << crawl_url(url)
                nil
            else
                crawl_url(url, &block)
            end
        end

        # The fetch method performs a HTTP GET to obtain the HTML document.
        # Invalid urls or any HTTP response that doesn't return a HTML body
        # will be ignored and nil will be returned. This means that redirects
        # etc. will not be followed.
        # NOTE(review): the blanket rescue deliberately swallows ALL
        # StandardErrors (bad URI, DNS failure, timeout) and returns nil.
        def fetch(url)
            raise unless url.respond_to?(:to_uri)
            res = Net::HTTP.get_response(url.to_uri)
            res.body.empty? ? nil : res.body
        rescue
            nil
        end

        # Appends url to @urls, wrapping plain Strings in Wgit::Url.
        def add_url(url)
            @urls = [] if @urls.nil?
            if url.instance_of?(Url)
                @urls << url
            else
                @urls << Wgit::Url.new(url)
            end
        end

        alias :crawl :crawl_urls
        alias :crawl_r :crawl_site
    end
end
|
@@ -0,0 +1,269 @@
|
|
1
|
+
require_relative '../document'
|
2
|
+
require_relative '../url'
|
3
|
+
require_relative '../utils'
|
4
|
+
require_relative '../assertable'
|
5
|
+
require_relative 'mongo_connection_details'
|
6
|
+
require_relative 'model'
|
7
|
+
require 'mongo'
|
8
|
+
|
9
|
+
module Wgit

    # @author Michael Telford
    # Class modeling a DB connection and CRUD operations for the Url and
    # Document collections.
    # The most common methods are: insert, update, urls, search, stats, size.
    class Database
        include Assertable

        # Is relative to the root project folder, not this file.
        # NOTE(review): hard-coded relative path — raises Errno::ENOENT if the
        # process CWD doesn't contain misc/; consider making configurable.
        LOG_FILE_PATH = "misc/mongo_log.txt"

        # Connects to MongoDB using Wgit::CONNECTION_DETAILS.
        # NOTE(review): the client is stored in a class variable, so every
        # Database instance (and subclass) shares the connection made by the
        # most recent initialize call.
        def initialize
            conn_details = Wgit::CONNECTION_DETAILS
            if conn_details.empty?
                raise "Wgit::CONNECTION_DETAILS must be defined and include :host,
    :port, :db, :uname, :pword for a database connection to be established."
            end

            logger = Logger.new(LOG_FILE_PATH)
            address = "#{conn_details[:host]}:#{conn_details[:port]}"
            @@client = Mongo::Client.new([address],
                                         :database => conn_details[:db],
                                         :user => conn_details[:uname],
                                         :password => conn_details[:pword],
                                         :logger => logger,
                                         :truncate_logs => false)
        end

        ### Create Data ###

        # Inserts a Url, Document or an Enumerable of either into the DB.
        # Dispatches on the type of data (or of its first element).
        def insert(data)
            if data.is_a?(Url)
                insert_urls(data)
            elsif data.is_a?(Document)
                insert_docs(data)
            elsif data.respond_to?(:first)
                if data.first.is_a?(Url)
                    insert_urls(data)
                else
                    insert_docs(data)
                end
            else
                raise "data is not in the correct format (all Url's or Document's)"
            end
        end

        # Inserts one Url or an Enumerable of Urls into the :urls collection.
        def insert_urls(url_or_urls)
            unless url_or_urls.respond_to?(:map)
                assert_type(url_or_urls, Url)
                url_or_urls = Wgit::Model.url(url_or_urls)
            else
                assert_arr_types(url_or_urls, Url)
                url_or_urls = url_or_urls.map do |url|
                    Wgit::Model.url(url)
                end
            end
            create(:urls, url_or_urls)
        end

        # Inserts one Document/Hash or an Enumerable of them into :documents.
        def insert_docs(doc_or_docs)
            unless doc_or_docs.respond_to?(:map)
                assert_type(doc_or_docs, [Document, Hash])
                unless doc_or_docs.is_a?(Hash)
                    doc_or_docs = Wgit::Model.document(doc_or_docs)
                end
            else
                assert_arr_types(doc_or_docs, [Document, Hash])
                # NOTE(review): `unless doc.is_a?(Hash)` inside map yields nil
                # for Hash elements (mapping them to nil, not passing through)
                # — looks like a latent bug; confirm against callers.
                doc_or_docs = doc_or_docs.map do |doc|
                    Wgit::Model.document(doc) unless doc.is_a?(Hash)
                end
            end
            create(:documents, doc_or_docs)
        end

        ### Retrieve Data ###

        # A crawled parameter value of nil (the default) returns all urls.
        # A limit of 0 means all urls are returned.
        # All urls are sorted by date_added ascending, in other words the first
        # url in the results is the first added.
        def urls(crawled = nil, limit = 0, skip = 0, &block)
            crawled.nil? ? query = {} : query = { :crawled => crawled }

            sort = { :date_added => 1 }
            results = retrieve(:urls, query, sort, {}, limit, skip)
            return [] if results.count < 1

            # results.respond_to? :map! is false so we use map and overwrite the var.
            results = results.map { |url_doc| Wgit::Url.new(url_doc) }
            return results unless block_given?
            results.each { |url| block.call(url) }
        end

        # Urls with crawled == true.
        def crawled_urls(limit = 0, skip = 0, &block)
            urls(true, limit, skip, &block)
        end

        # Urls with crawled == false.
        def uncrawled_urls(limit = 0, skip = 0, &block)
            urls(false, limit, skip, &block)
        end

        # Currently all searches are case insensitive.
        #
        # Searches against the indexed docs in the DB for the given text.
        # The searched fields are decided by the text index setup against the
        # documents collection. Currently we search against the following fields:
        # "author", "keywords", "title" and "text".
        #
        # The MongoDB search ranks/sorts the results in order (highest first) based
        # upon each documents textScore which records the number of text hits. We
        # then store this textScore in each Document object for use elsewhere if
        # needed.
        #
        # @param text [String] the value to search the data against.
        # @param whole_sentence [Boolean] whether multiple words should be
        # searched for separately.
        # @param limit [Fixnum] the max length/count of the results array.
        # @param skip [Fixnum] the number of results to skip, starting with the
        # most relevant based upon the textScore of the search.
        # @param block [Block] a block which if provided is passed to each result.
        #
        # @return [Array] of Document objects representing the search results.
        def search(text, whole_sentence = false, limit = 10, skip = 0, &block)
            # NOTE(review): mutates the caller's text argument in place.
            text.strip!
            text.replace("\"" + text + "\"") if whole_sentence

            # The textScore sorts based on the most search hits.
            # We use the textScore hash as a sort and a projection below.
            # :$caseSensitive => case_sensitive, # 3.2+ only.
            sort_proj = { :score => { :$meta => "textScore" } }
            query = { :$text => { :$search => text } }
            results = retrieve(:documents, query, sort_proj, sort_proj, limit, skip)

            return [] if results.count < 1
            # results.respond_to? :map! is false so we use map and overwrite the var.
            results = results.map { |mongo_doc| Wgit::Document.new(mongo_doc) }
            return results unless block_given?
            results.each { |doc| block.call(doc) }
        end

        # Performs a search and pretty prints the results.
        def search_p(text, whole_sentence = false, limit = 10,
                     skip = 0, sentence_length = 80, &block)
            results = search(text, whole_sentence, limit, skip, &block)
            Wgit::Utils.printf_search_results(results, text, false, sentence_length)
        end

        # Returns a Mongo object which can be used like a Hash to retrieve values.
        # NOTE(review): dbStats is passed 0 here — MongoDB conventionally
        # expects {dbStats: 1}; confirm 0 behaves identically with this driver.
        def stats
            @@client.command(:dbStats => 0).documents[0]
        end

        # The dataSize (in bytes) of the database.
        def size
            stats[:dataSize]
        end

        ### Update Data ###

        # Updates a Url or Document in the DB, dispatching on type.
        def update(data)
            if data.is_a?(Url)
                update_url(data)
            elsif data.is_a?(Document)
                update_doc(data)
            else
                raise "data is not in the correct format (all Url's or Document's)"
            end
        end

        # Updates the url record matching url's value, refreshing date_modified.
        def update_url(url)
            assert_type(url, Url)
            selection = { :url => url }
            url_hash = Wgit::Model.url(url).merge(Wgit::Model.common_update_data)
            update = { "$set" => url_hash }
            _update(true, :urls, selection, update)
        end

        # Updates the document record matching doc.url, refreshing date_modified.
        def update_doc(doc)
            assert_type(doc, Document)
            selection = { :url => doc.url }
            doc_hash = Wgit::Model.document(doc).merge(Wgit::Model.common_update_data)
            update = { "$set" => doc_hash }
            _update(true, :documents, selection, update)
        end

        private

        # Returns true if the driver write result indicates success.
        # Matches on result class name because the driver returns different
        # result classes per operation / server version.
        def write_succeeded?(result, count = 1, multi = false)
            case result.class.to_s
            # Single create result.
            when "Mongo::Operation::Write::Insert::Result"
                result.documents.first[:err].nil?
            # Multiple create result.
            when "Mongo::BulkWrite::Result"
                result.inserted_count == count
            # Single and multiple update result.
            when "Mongo::Operation::Write::Update::Result", # MongoDB 3.0
                 "Mongo::Operation::Write::Update::LegacyResult" # MongoDB 2.4
                if multi
                    result.n == count
                else
                    result.documents.first[:err].nil?
                end
            else
                raise "Result class not currently supported: #{result.class.to_s}"
            end
        end

        # Inserts a Hash (insert_one) or Array of Hashes (insert_many) into
        # collection, merging in the common insert timestamps first.
        # Returns the number of inserted records.
        def create(collection, data)
            assert_type(data, [Hash, Array])
            # Single doc.
            if data.is_a?(Hash)
                data.merge!(Wgit::Model.common_insert_data)
                result = @@client[collection.to_sym].insert_one(data)
                unless write_succeeded?(result)
                    raise "DB write (insert) failed"
                end
                result.n
            # Multiple docs.
            elsif data.is_a?(Array)
                assert_arr_types(data, Hash)
                data.map! do |data_hash|
                    data_hash.merge(Wgit::Model.common_insert_data)
                end
                result = @@client[collection.to_sym].insert_many(data)
                unless write_succeeded?(result, data.length)
                    raise "DB write(s) failed"
                end
                result.inserted_count
            else
                raise "data must be a Hash or an Array of Hash's"
            end
        end

        # Builds a find query with projection, skip, limit and sort applied.
        # Returns the (lazy) driver result set.
        def retrieve(collection, query, sort = {}, projection = {},
                     limit = 0, skip = 0)
            assert_type(query, Hash)
            @@client[collection.to_sym].find(query).projection(projection)
                                       .skip(skip).limit(limit).sort(sort)
        end

        # NOTE: The Model.common_update_data should be merged in the calling
        # method as the update param can be bespoke due to its nature.
        def _update(single, collection, selection, update)
            assert_arr_types([selection, update], Hash)
            if single
                result = @@client[collection.to_sym].update_one(selection, update)
            else
                result = @@client[collection.to_sym].update_many(selection, update)
            end
            raise "DB write (update) failed" unless write_succeeded?(result)
            result.n
        end

        alias :count :size
        alias :length :size
        alias :insert_url :insert_urls
        alias :insert_doc :insert_docs
        alias :search_and_format :search_p
    end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require_relative '../utils'
|
2
|
+
|
3
|
+
module Wgit

  # @author Michael Telford
  # Module containing the DB data model structure.
  module Model
    # Maps a url to its Hash form ready for DB storage.
    # Raises if url has no #to_h; otherwise returns url.to_h.
    def self.url(url)
      raise "url must respond to to_h" unless url.respond_to?(:to_h)
      url.to_h
    end

    # Maps a doc to its Hash form ready for DB storage (to_h is passed
    # false so the raw HTML is excluded).
    # Raises if doc has no #to_h.
    def self.document(doc)
      raise "doc must respond to to_h" unless doc.respond_to?(:to_h)
      doc.to_h(false)
    end

    # Timestamp fields merged into every newly inserted record.
    def self.common_insert_data
      {
        date_added: Wgit::Utils.time_stamp,
        date_modified: Wgit::Utils.time_stamp,
      }
    end

    # Timestamp field merged into every updated record.
    def self.common_update_data
      {
        date_modified: Wgit::Utils.time_stamp,
      }
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
|
2
|
+
# @author Michael Telford
|
3
|
+
# @author Michael Telford
# Declares Wgit::DB_PROVIDER and Wgit::CONNECTION_DETAILS used by
# Wgit::Database to connect to MongoDB.
#
# FIXME(security): the previous revision committed live DB credentials to
# source control. The values below remain as defaults for backward
# compatibility, but they should be treated as leaked — rotate the password
# and supply real values via the WGIT_DB_* environment variables.
module Wgit
  DB_PROVIDER = :MongoLabs.freeze

  # OpenShift (MongoDB 2.4)
  if DB_PROVIDER == :OpenShift
    CONNECTION_DETAILS = {
      :host  => ENV.fetch("WGIT_DB_HOST",  "127.0.0.1"),
      :port  => ENV.fetch("WGIT_DB_PORT",  "27017"),
      :db    => ENV.fetch("WGIT_DB_NAME",  "admin"),
      :uname => ENV.fetch("WGIT_DB_UNAME", "admin"),
      :pword => ENV.fetch("WGIT_DB_PWORD", "R5jUKv1fessb")
    }.freeze
  # MongoLabs (MongoDB 3.0)
  elsif DB_PROVIDER == :MongoLabs
    CONNECTION_DETAILS = {
      :host  => ENV.fetch("WGIT_DB_HOST",  "ds037205.mongolab.com"),
      :port  => ENV.fetch("WGIT_DB_PORT",  "37205"),
      :db    => ENV.fetch("WGIT_DB_NAME",  "crawler"),
      :uname => ENV.fetch("WGIT_DB_UNAME", "rubyapp"),
      :pword => ENV.fetch("WGIT_DB_PWORD", "R5jUKv1fessb"),
    }.freeze
  else
    raise "Database provider '#{DB_PROVIDER}' is not recognized"
  end
end
|
@@ -0,0 +1,293 @@
|
|
1
|
+
require_relative 'url'
|
2
|
+
require_relative 'utils'
|
3
|
+
require_relative 'assertable'
|
4
|
+
require 'nokogiri'
|
5
|
+
|
6
|
+
module Wgit

    # @author Michael Telford
    # Class modeling a HTML web document. Also doubles as a search result.
    class Document
        include Assertable

        # HTML elements whose text() nodes populate @text.
        TEXT_ELEMENTS = [:dd, :div, :dl, :dt, :figcaption, :figure, :hr, :li,
                         :main, :ol, :p, :pre, :span, :ul, :h1, :h2, :h3, :h4, :h5]

        attr_reader :url, :html, :title, :author, :keywords, :links, :text, :score

        # Builds either from a Url + raw html String (parses with Nokogiri and
        # initialises title/author/keywords/links/text), or from a mongo
        # collection document (Hash-like, no parsing).
        # Note: the String branch still asserts url_or_doc is a Wgit::Url
        # (Url subclasses String).
        def initialize(url_or_doc, html = nil)
            if (url_or_doc.is_a?(String))
                assert_type(url_or_doc, Url)
                html ||= ""

                @url = url_or_doc
                @html = html

                @doc = Nokogiri::HTML(html) do |config|
                    # TODO: Remove #'s below when crawling in production.
                    #config.options = Nokogiri::XML::ParseOptions::STRICT |
                    #                 Nokogiri::XML::ParseOptions::NONET
                end

                init_title
                init_author
                init_keywords
                init_links
                init_text
                @score = 0.0
            else
                # Init from a mongo collection document.
                @url = Wgit::Url.new(url_or_doc[:url])
                @html = url_or_doc[:html].nil? ? "" : url_or_doc[:html]
                @title = url_or_doc[:title]
                @author = url_or_doc[:author]
                @keywords = url_or_doc[:keywords].nil? ? [] : url_or_doc[:keywords]
                @links = url_or_doc[:links].nil? ? [] : url_or_doc[:links]
                @links.map! { |link| Wgit::Url.new(link) }
                @text = url_or_doc[:text].nil? ? [] : url_or_doc[:text]
                @score = url_or_doc[:score].nil? ? 0.0 : url_or_doc[:score]
            end
        end

        # Links whose Url#relative_link? is true. A link that raises while
        # being classified is excluded.
        def internal_links
            return [] if @links.empty?
            @links.reject do |link|
                begin
                    not link.relative_link?
                rescue
                    true
                end
            end
        end

        # Internal links prefixed with this document's base url.
        # NOTE(review): link.replace mutates the stored @links entries.
        def internal_full_links
            return [] if internal_links.empty?
            internal_links.map do |link|
                link.replace("/" + link) unless link.start_with?("/")
                Wgit::Url.new(@url.to_base + link)
            end
        end

        # Links whose Url#relative_link? is false. A link that raises while
        # being classified is excluded.
        def external_links
            return [] if @links.empty?
            @links.reject do |link|
                begin
                    link.relative_link?
                rescue
                    true
                end
            end
        end

        # Returns a Hash of instance variable name => #length, with @text
        # expanded into :text_length (sentence count) and :text_bytes.
        def stats
            hash = {}
            instance_variables.each do |var|
                # Add up the total bytes of text as well as the length.
                if var == :@text
                    count = 0
                    @text.each { |t| count += t.length }
                    hash[:text_length] = @text.length
                    hash[:text_bytes] = count
                # Else take the #length method return value.
                else
                    next unless instance_variable_get(var).respond_to?(:length)
                    hash[var[1..-1].to_sym] =
                                        instance_variable_get(var).send(:length)
                end
            end
            hash
        end

        # The byte length of the raw HTML.
        def size
            stats[:html]
        end

        # Serializes the document to a Hash; the raw HTML and the Nokogiri
        # @doc are excluded unless include_html is true (@doc always excluded).
        def to_h(include_html = false)
            ignore = include_html ? [] : [:@html]
            ignore << :@doc # Always ignore :@doc
            Wgit::Utils.to_h(self, ignore)
        end

        # Override of the default == method, is equal if url and html both match.
        # Use doc.object_id == other_doc.object_id for exact object comparison.
        def ==(other_doc)
            return false unless other_doc.is_a? Wgit::Document
            url == other_doc.url and html == other_doc.html
        end

        # Shortcut for calling Document#html[range].
        def [](range)
            html[range]
        end

        # True when the document has no (non-whitespace) HTML.
        def empty?
            html.strip.empty?
        end

        # Searches against the Document#text for the given search text.
        # The number of search hits for each sentenence are recorded internally
        # and used to rank/sort the search results before being returned. Where
        # the Database#search method search all documents for the most hits this
        # method searches each documents text for the most hits.
        #
        # Each search result comprises of a sentence of a given length. The length
        # will be based on the sentence_limit parameter or the full length of the
        # original sentence, which ever is less. The algorithm obviously ensures
        # that the search value is visible somewhere in the sentence.
        #
        # @param text [String] the value to search the document text against.
        # @param sentence_limit [Fixnum] the length of each search result
        # sentence.
        #
        # @return [Array] of String objects representing the search results.
        def search(text, sentence_limit = 80)
            raise "A search value must be provided" if text.empty?
            raise "The sentence length value must be even" if sentence_limit.odd?

            results = {}
            regex = Regexp.new(text, Regexp::IGNORECASE)

            @text.each do |sentence|
                hits = sentence.scan(regex).count
                if hits > 0
                    sentence.strip!
                    index = sentence.index(regex)
                    # NOTE(review): return value discarded — this relies on
                    # Utils.format_sentence_length mutating sentence in place;
                    # confirm against Utils implementation.
                    Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
                    results[sentence] = hits
                end
            end

            return [] if results.empty?
            # Sort ascending by hit count, then reverse for highest first.
            results = Hash[results.sort_by { |k, v| v }]
            results.keys.reverse
        end

        # Performs a text search (see search for details) but assigns the results
        # to the @text instance variable. This can be used for sub search
        # functionality. Note that there is no way of getting the original text
        # back however.
        def search!(text)
            @text = search(text)
        end

        # Uses Nokogiri's xpath method to search the doc's html and return the
        # results.
        def xpath(xpath)
            @doc.xpath(xpath)
        end

        private

        # UTF-8 re-encodes (replacing invalid bytes) and strips str in place.
        def process_str(str)
            str.encode!('UTF-8', 'UTF-8', :invalid => :replace)
            str.strip!
            str # This is required to return the str, do not remove.
        end

        # Cleans an Array of Strings in place: encode/strip each, drop empties
        # and duplicates.
        def process_arr(array)
            assert_arr_types(array, String)
            array.map! { |str| process_str(str) }
            array.reject! { |str| str.empty? }
            array.uniq!
        end

        # Modifies internal links by removing this doc's base or host url if
        # present. http://www.google.co.uk/about.html (with or without the
        # protocol prefix) will become about.html meaning it'll appear within
        # internal_links.
        def process_internal_links(links)
            links.map! do |link|
                host_or_base = if link.start_with?("http")
                                   url.base
                               else
                                   url.host
                               end
                if link.start_with?(host_or_base)
                    link.sub!(host_or_base, "")
                    link.replace(link[1..-1]) if link.start_with?("/")
                    link.strip!
                end
                link
            end
        end

        # Builds the union xpath selecting text() of every TEXT_ELEMENTS tag.
        def text_elements_xpath
            xpath = ""
            return xpath if TEXT_ELEMENTS.empty?
            el_xpath = "//%s/text()"
            TEXT_ELEMENTS.each_with_index do |el, i|
                xpath += " | " unless i == 0
                xpath += el_xpath % [el]
            end
            xpath
        end

        # Runs xpath against @doc and assigns the result (first node's content,
        # or all contents when first_result is false) to the named ivar.
        # Leaves the ivar untouched when there are no results.
        def init_var(xpath, var, first_result = true)
            results = @doc.xpath(xpath)
            unless results.nil? || results.empty?
                result = if first_result
                             results.first.content
                         else
                             results.map { |res| res.content }
                         end
                instance_variable_set(var, result)
            end
        end

        # Sets @title from the <title> element (nil when absent).
        def init_title
            @title = nil
            xpath = "//title"
            init_var(xpath, :@title)
            process_str(@title) unless @title.nil?
        end

        # Sets @author from the author meta tag (nil when absent).
        def init_author
            @author = nil
            xpath = "//meta[@name='author']/@content"
            init_var(xpath, :@author)
            process_str(@author) unless @author.nil?
        end

        # Sets @keywords from the keywords meta tag, split on commas
        # ([] when absent).
        def init_keywords
            @keywords = nil
            xpath = "//meta[@name='keywords']/@content"
            init_var(xpath, :@keywords)
            return @keywords = [] unless @keywords
            @keywords = @keywords.split(",")
            process_arr(@keywords)
        end

        # Sets @links from all anchor hrefs, dropping "/", wrapping each in
        # Wgit::Url (invalid urls dropped) and stripping this doc's host/base.
        def init_links
            @links = nil
            xpath = "//a/@href"
            init_var(xpath, :@links, false)
            return @links = [] unless @links
            process_arr(@links)
            @links.reject! { |link| link == "/" }
            @links.map! do |link|
                begin
                    Wgit::Url.new(link)
                rescue
                    nil
                end
            end
            @links.reject! { |link| link.nil? }
            process_internal_links(@links)
        end

        # Sets @text from the text() of all TEXT_ELEMENTS ([] when none).
        def init_text
            @text = nil
            xpath = text_elements_xpath
            init_var(xpath, :@text, false)
            return @text = [] unless @text
            process_arr(@text)
        end

        alias :to_hash :to_h
        alias :relative_links :internal_links
        alias :relative_urls :internal_links
        alias :relative_full_links :internal_full_links
        alias :relative_full_urls :internal_full_links
        alias :external_urls :external_links
    end
end
|
data/lib/wgit/url.rb
ADDED
@@ -0,0 +1,140 @@
|
|
1
|
+
require_relative 'utils'
|
2
|
+
require 'uri'
|
3
|
+
|
4
|
+
module Wgit

  # @author Michael Telford
  # Class modeling a web based URL.
  # Can be an internal link e.g. "about.html"
  # or a full URL e.g. "http://www.google.co.uk".
  class Url < String
    attr_accessor :crawled, :date_crawled

    # url_or_doc: either a url String or a mongo collection document
    # (Hash-like, keyed by :url, :crawled, :date_crawled).
    # Raises (via URI()) if the url cannot be parsed.
    def initialize(url_or_doc, crawled = false, date_crawled = nil)
      if (url_or_doc.is_a?(String))
        url = url_or_doc
      else
        # Init from a mongo collection document.
        url = url_or_doc[:url]
        crawled = url_or_doc[:crawled].nil? ? false : url_or_doc[:crawled]
        date_crawled = url_or_doc[:date_crawled]
      end
      @uri = URI(url)
      @crawled = crawled
      @date_crawled = date_crawled
      super(url)
    end

    # Raises unless url is an absolute http(s) URL.
    def self.validate(url)
      if Wgit::Url.relative_link?(url)
        raise "Invalid url (or a relative link): #{url}"
      end
      unless url.start_with?("http://") or url.start_with?("https://")
        raise "Invalid url (missing protocol prefix): #{url}"
      end
      # NOTE: URI.regexp is deprecated in newer Rubies; kept for
      # behavioral compatibility with the rest of this class.
      if URI.regexp.match(url).nil?
        raise "Invalid url: #{url}"
      end
    end

    # Boolean wrapper around validate: true if url passes, else false.
    def self.valid?(url)
      Wgit::Url.validate(url)
      true
    rescue
      false
    end

    # Modifies the receiver url by prefixing it with a protocol.
    # Returns the url whether its been modified or not.
    def self.prefix_protocol(url, https = false)
      unless url.start_with?("http://") or url.start_with?("https://")
        if https
          url.replace("https://#{url}")
        else
          url.replace("http://#{url}")
        end
      end
      url
    end

    # URI.split("http://www.google.co.uk/about.html") yields segments
    # where array[2] is the host ("www.google.co.uk") and array[5] the
    # path ("/about.html"). A link with a host is treated as external;
    # one with only a path is relative/internal. This means all external
    # links in a page are expected to have a protocol prefix e.g.
    # "http://", otherwise the link is treated as internal (regardless
    # of whether it is valid or not).
    def self.relative_link?(link)
      link_segs = URI.split(link)
      if not link_segs[2].nil? and not link_segs[2].empty?
        false
      elsif not link_segs[5].nil? and not link_segs[5].empty?
        true
      else
        raise "Invalid link: #{link}"
      end
    end

    # Joins host and link with exactly one "/" separator, returning a
    # new Wgit::Url. FIX: operates on a copy of host — previously
    # `chop!` mutated the caller's string in place, which also corrupted
    # the receiver of the instance-level #concat when it ended in "/".
    def self.concat(host, link)
      url = host.dup
      url.chop! if url.end_with?("/")
      link = link[1..-1] if link.start_with?("/")
      Wgit::Url.new(url + "/" + link)
    end

    # Instance form of Url.relative_link? for the receiver.
    def relative_link?
      Wgit::Url.relative_link?(self)
    end

    # Instance form of Url.valid? for the receiver.
    def valid?
      Wgit::Url.valid?(self)
    end

    # Joins the receiver and link (see Url.concat); the receiver is
    # NOT modified.
    def concat(link)
      Wgit::Url.concat(self, link)
    end

    # Setting crawled also stamps (or clears) date_crawled.
    def crawled=(bool)
      @crawled = bool
      @date_crawled = bool ? Wgit::Utils.time_stamp : nil
    end

    # Returns the URI parsed at construction time.
    def to_uri
      @uri
    end

    # Returns self (already a Url).
    def to_url
      self
    end

    # Given http://www.google.co.uk/about.html, www.google.co.uk is returned.
    def to_host
      Wgit::Url.new(@uri.host)
    end

    # URI.split("http://www.google.co.uk/about.html") yields segments
    # where array[0] is the scheme ("http") and array[2] the host
    # ("www.google.co.uk"). Returns "#{scheme}://#{host}" e.g.
    # http://www.google.co.uk. Raises for relative links or when either
    # part is missing.
    def to_base
      if Wgit::Url.relative_link?(self)
        raise "A relative link doesn't have a base URL: #{self}"
      end
      url_segs = URI.split(self)
      if url_segs[0].nil? or url_segs[2].nil? or url_segs[2].empty?
        raise "Both a protocol and host are needed: #{self}"
      end
      base = "#{url_segs[0]}://#{url_segs[2]}"
      Wgit::Url.new(base)
    end

    # Hash of the instance vars (minus @uri) with :url first.
    def to_h
      ignore = [:@uri]
      h = Wgit::Utils.to_h(self, ignore)
      Hash[h.to_a.insert(0, [:url, self])] # Insert url at position 0.
    end

    alias :to_hash :to_h
    alias :host :to_host
    alias :base :to_base
    alias :internal_link? :relative_link?
    alias :crawled? :crawled
  end
end
|
data/lib/wgit/utils.rb
ADDED
@@ -0,0 +1,115 @@
|
|
1
|
+
|
2
|
+
module Wgit

  # @author Michael Telford
  # Utility module containing generic methods.
  module Utils
    # Returns the current Time (Time.new with no args == now).
    def self.time_stamp
      Time.new
    end

    # Returns a hash created from obj's instance vars and values.
    # Keys are the var names without the leading "@", as symbols.
    # ignore entries must match the raw var symbols e.g. :@uri.
    def self.to_h(obj, ignore = [])
      hash = {}
      obj.instance_variables.each do |var|
        next if ignore.include?(var)
        hash[var[1..-1].to_sym] = obj.instance_variable_get(var)
      end
      hash
    end

    # Improved each method which takes care of singleton and enumerable
    # objects. Yields one or more objects.
    def self.each(obj_or_objs)
      if obj_or_objs.respond_to?(:each)
        obj_or_objs.each { |obj| yield obj }
      else
        yield obj_or_objs
      end
    end

    # Formats the sentence (modifies the receiver) and returns its value.
    # The length will be based on the sentence_limit parameter or the full
    # length of the original sentence, which ever is less. The full sentence
    # is returned if the sentence_limit is 0. The algorithm obviously ensures
    # that the search value is visible somewhere in the sentence.
    # index: position of the matched text within sentence, used as the
    # centre of the extracted window. sentence_limit must be even so the
    # window splits equally around index.
    def self.format_sentence_length(sentence, index, sentence_limit)
      raise "A sentence value must be provided" if sentence.empty?
      raise "The sentence length value must be even" if sentence_limit.odd?
      if index < 0 or index > sentence.length
        raise "Incorrect index value: #{index}"
      end

      return sentence if sentence_limit == 0

      start = 0
      finish = sentence.length

      if sentence.length > sentence_limit
        # Centre a window of sentence_limit chars on index...
        start = index - (sentence_limit / 2)
        finish = index + (sentence_limit / 2)

        # ...then shift the window back inside the string bounds,
        # transferring the overflow to the opposite side.
        if start < 0
          diff = 0 - start
          if (finish + diff) > sentence.length
            finish = sentence.length
          else
            finish += diff
          end
          start = 0
        elsif finish > sentence.length
          diff = finish - sentence.length
          if (start - diff) < 0
            start = 0
          else
            start -= diff
          end
          finish = sentence.length
        end

        # Sanity check: the adjusted window must be exactly the limit.
        raise if sentence[start..(finish - 1)].length != sentence_limit
      end

      sentence.replace(sentence[start..(finish - 1)])
    end

    # Prints out the search results in a search engine page format.
    # Most of the params are passed to Document#search - see class docs.
    # The stream param decides where the printf output is written to, and
    # therefore must respond_to? :puts
    # The format for each result is:
    #
    # Title
    # Keywords (if there are some)
    # Text Snippet (showing the searched for text if provided)
    # Url
    # <empty_line>
    #
    # NOTE(review): case_sensitive is accepted but never used in this
    # method body — presumably it was meant to be forwarded to
    # doc.search; confirm against Document#search.
    def self.printf_search_results(results, text = nil, case_sensitive = false,
                                   sentence_length = 80, keyword_count = 5,
                                   stream = Kernel)
      raise "stream must respond_to? :puts" unless stream.respond_to? :puts
      keyword_count -= 1 # Because Arrays are zero indexed.

      results.each do |doc|
        sentence = if text.nil?
          nil
        else
          sentence = doc.search(text, sentence_length).first
          if sentence.nil?
            nil
          else
            # Blank snippets are suppressed entirely.
            sentence.strip.empty? ? nil : sentence
          end
        end
        stream.puts doc.title
        unless doc.keywords.empty?
          stream.puts doc.keywords[0..keyword_count].join(", ")
        end
        stream.puts sentence unless sentence.nil?
        stream.puts doc.url
        stream.puts
      end
      nil
    end
  end
end
|
data/lib/wgit/web_crawler.rb
ADDED
@@ -0,0 +1,134 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require_relative 'crawler'
|
4
|
+
require_relative 'database/database'
|
5
|
+
|
6
|
+
# @author Michael Telford
module Wgit

  # Convenience method to crawl the World Wide Web.
  # The default value (-1) for max_sites_to_crawl is unrestricted.
  # The default max_data_size is 1GB (1048576000 bytes).
  def self.crawl_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
    db = Wgit::Database.new
    web_crawler = Wgit::WebCrawler.new(db, max_sites_to_crawl, max_data_size)
    web_crawler.crawl_the_web
  end

  # Class which sets up a crawler and saves the indexed
  # docs to a database. Will crawl the web forever if you let it :-)
  class WebCrawler
    # Crawl limits; writable so they can be tuned between runs.
    attr_accessor :max_sites_to_crawl, :max_data_size
    # The underlying Wgit::Crawler and the database handle.
    attr_reader :crawler, :db

    # database: must respond to #size, #uncrawled_urls, #insert and
    # #update (e.g. Wgit::Database).
    def initialize(database,
                   max_sites_to_crawl = -1,
                   max_data_size = 1048576000)
      @crawler = Wgit::Crawler.new
      @db = database
      @max_sites_to_crawl = max_sites_to_crawl
      @max_data_size = max_data_size
    end

    # Retrieves url's from the database and recursively crawls each site
    # storing their internal pages into the database and adding their external
    # url's to be crawled at a later date.
    def crawl_the_web
      if max_sites_to_crawl < 0
        puts "Crawling until the database has been filled or it runs out of \
urls to crawl (which might be never)."
      end
      loop_count = 0 # Counts sites crawled, compared to max_sites_to_crawl.

      while keep_crawling?(loop_count) do
        puts "Current database size: #{db.size}"
        crawler.urls = db.uncrawled_urls

        if crawler.urls.empty?
          puts "No urls to crawl, exiting."
          break
        end
        puts "Starting crawl loop for: #{crawler.urls}"

        docs_count = 0
        urls_count = 0

        crawler.urls.each do |url|
          unless keep_crawling?(loop_count)
            puts "Reached max number of sites to crawl or database \
capacity, exiting."
            # Early return skips the per-iteration summary below.
            return
          end
          loop_count += 1

          # NOTE(review): url is flagged crawled BEFORE the site is
          # actually crawled — presumably so a failing site isn't
          # re-picked forever; confirm this is intended.
          url.crawled = true
          # Expect exactly one record to have been updated.
          raise unless db.update(url) == 1

          site_docs_count = 0
          ext_links = crawler.crawl_site(url) do |doc|
            unless doc.empty?
              if write_doc_to_db(doc)
                docs_count += 1
                site_docs_count += 1
              end
            end
          end

          urls_count += write_urls_to_db(ext_links)
          puts "Crawled and saved #{site_docs_count} docs for the \
site: #{url}"
        end

        puts "Crawled and saved docs for #{docs_count} url(s) overall for \
this iteration."
        puts "Found and saved #{urls_count} external url(s) for the next \
iteration."
      end
    end

    private

    # Keep crawling or not based on DB size and current loop interation.
    def keep_crawling?(loop_count)
      return false if db.size >= max_data_size
      # If max_sites_to_crawl is -1 for example then crawl away.
      if max_sites_to_crawl < 0
        true
      else
        loop_count < max_sites_to_crawl
      end
    end

    # The unique url index on the documents collection prevents duplicate
    # inserts. Returns true on insert, false when the doc already exists.
    # NOTE(review): OperationFailure is broader than duplicate-key —
    # other insert failures are also reported as "already exists".
    def write_doc_to_db(doc)
      db.insert(doc)
      puts "Saved document for url: #{doc.url}"
      true
    rescue Mongo::Error::OperationFailure
      puts "Document already exists: #{doc.url}"
      false
    end

    # The unique url index on the urls collection prevents duplicate inserts.
    # Returns the number of urls actually inserted (0 if urls isn't
    # enumerable).
    def write_urls_to_db(urls)
      count = 0
      if urls.respond_to?(:each)
        urls.each do |url|
          begin
            db.insert(url)
            count += 1
            puts "Inserted url: #{url}"
          rescue Mongo::Error::OperationFailure
            puts "Url already exists: #{url}"
          end
        end
      end
      count
    end
  end
end

# Allows this file to be run directly as a script.
if __FILE__ == $0
  Wgit.crawl_the_web
end
|
metadata
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: wgit
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Michael Telford
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-03-07 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Wgit is a WWW indexer/scraper which crawls URLs and retrieves their
  page contents for later use. Also included in this package is a means to search
  indexed documents stored in a database. Therefore this library provides the main
  components of a WWW search engine. You can also use Wgit to copy entire websites'
  HTML, making it far more powerful than wget. The Wgit API is easily extendable,
  allowing you to pull out the parts of a webpage that are important to you, the
  CSS or JS links for example.
|
20
|
+
email: michael.telford@live.com
|
21
|
+
executables: []
|
22
|
+
extensions: []
|
23
|
+
extra_rdoc_files: []
|
24
|
+
files:
|
25
|
+
- "./lib/wgit.rb"
|
26
|
+
- "./lib/wgit/assertable.rb"
|
27
|
+
- "./lib/wgit/core_ext.rb"
|
28
|
+
- "./lib/wgit/crawler.rb"
|
29
|
+
- "./lib/wgit/database/database.rb"
|
30
|
+
- "./lib/wgit/database/model.rb"
|
31
|
+
- "./lib/wgit/database/mongo_connection_details.rb"
|
32
|
+
- "./lib/wgit/document.rb"
|
33
|
+
- "./lib/wgit/url.rb"
|
34
|
+
- "./lib/wgit/utils.rb"
|
35
|
+
- "./lib/wgit/version.rb"
|
36
|
+
- "./lib/wgit/web_crawler.rb"
|
37
|
+
homepage: http://rubygems.org/gems/wgit
|
38
|
+
licenses:
|
39
|
+
- MIT
|
40
|
+
metadata:
|
41
|
+
allowed_push_host: https://rubygems.org
|
42
|
+
post_install_message:
|
43
|
+
rdoc_options: []
|
44
|
+
require_paths:
|
45
|
+
- lib
|
46
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
47
|
+
requirements:
|
48
|
+
- - ">="
|
49
|
+
- !ruby/object:Gem::Version
|
50
|
+
version: '0'
|
51
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - ">="
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '0'
|
56
|
+
requirements: []
|
57
|
+
rubyforge_project:
|
58
|
+
rubygems_version: 2.4.5
|
59
|
+
signing_key:
|
60
|
+
specification_version: 4
|
61
|
+
summary: Wgit is wget on steroids with an easy to use API.
|
62
|
+
test_files: []
|