wgit 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/wgit.rb +11 -0
- data/lib/wgit/assertable.rb +69 -0
- data/lib/wgit/core_ext.rb +40 -0
- data/lib/wgit/crawler.rb +132 -0
- data/lib/wgit/database/database.rb +269 -0
- data/lib/wgit/database/model.rb +31 -0
- data/lib/wgit/database/mongo_connection_details.rb +27 -0
- data/lib/wgit/document.rb +293 -0
- data/lib/wgit/url.rb +140 -0
- data/lib/wgit/utils.rb +115 -0
- data/lib/wgit/version.rb +3 -0
- data/lib/wgit/web_crawler.rb +134 -0
- metadata +62 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 82f33e00a273c6cdeb3ba9c171110d849fff2428
+  data.tar.gz: 14c63f826d1d21811b14e9f3a2bca750b3f4afa3
+SHA512:
+  metadata.gz: 7c42b925f72d9e7cceba79d9aee764f97b6537c0005038501a1f75c36b1bcd3b6036cfb9b62fcf01fd435e0348c1e8c00c445a291051c068fa58184de2c9590a
+  data.tar.gz: a2a756c3be7b9b214921bfdac5846a2250e452265285cb9c3b812d2eaefc2ab969b608cd1841f34507a6ef184f20ba7c98658daf0135fb85eead88de0356320f
data/lib/wgit.rb
ADDED
@@ -0,0 +1,11 @@
+require_relative 'wgit/version'
+require_relative 'wgit/crawler'
+require_relative 'wgit/web_crawler'
+require_relative 'wgit/url'
+require_relative 'wgit/document'
+require_relative 'wgit/utils'
+require_relative 'wgit/assertable'
+require_relative 'wgit/database/database'
+require_relative 'wgit/database/model'
+require_relative 'wgit/database/mongo_connection_details'
+#require_relative 'wgit/core_ext'
data/lib/wgit/assertable.rb
ADDED
@@ -0,0 +1,69 @@
+
+module Wgit
+
+  # @author Michael Telford
+  # Module containing assert methods including type checking which can be used
+  # for asserting the integrity of method definitions etc.
+  module Assertable
+    DEFAULT_TYPE_FAIL_MSG = "Expected: %s, Actual: %s"
+    WRONG_METHOD_MSG = "arr must be Enumerable, use a different method"
+    DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s"
+
+    # obj.instance_of? must return true for one of the types listed in
+    # type_or_types or an exception is thrown using msg if provided.
+    # type_or_types can be a single Class or an Enumerable of Class objects,
+    # Strings and Symbols will not work.
+    def assert_types(obj, type_or_types, msg = nil)
+      msg ||= DEFAULT_TYPE_FAIL_MSG % [type_or_types, obj.class]
+      if type_or_types.respond_to?(:any?)
+        match = type_or_types.any? { |type| obj.instance_of?(type) }
+      else
+        match = obj.instance_of?(type_or_types)
+      end
+      raise msg unless match
+      obj
+    end
+
+    # Each object within arr must match one of the types listed in
+    # type_or_types or an exception is thrown using msg if provided.
+    # type_or_types can be a single Class or an Enumerable of Class objects,
+    # Strings and Symbols will not work.
+    def assert_arr_types(arr, type_or_types, msg = nil)
+      raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
+      arr.each do |obj|
+        assert_types(obj, type_or_types, msg)
+      end
+    end
+
+    # The obj_or_objs must respond_to? all of the given methods or an
+    # Exception is raised using msg or a default message.
+    # Returns obj_or_objs on successful assertion.
+    def assert_respond_to(obj_or_objs, methods, msg = nil)
+      if obj_or_objs.respond_to?(:each)
+        obj_or_objs.each do |obj|
+          _assert_respond_to(obj, methods, msg)
+        end
+      else
+        _assert_respond_to(obj_or_objs, methods, msg)
+      end
+      obj_or_objs
+    end
+
+    private
+
+    def _assert_respond_to(obj, methods, msg = nil)
+      msg ||= DEFAULT_DUCK_FAIL_MSG % ["#{obj.class} (#{obj})", methods]
+      match = methods.all? { |method| obj.respond_to?(method) }
+      raise msg unless match
+      obj
+    end
+
+    alias :assert_type :assert_types
+    alias :type :assert_types
+    alias :types :assert_types
+    alias :assert_arr_type :assert_arr_types
+    alias :arr_type :assert_arr_types
+    alias :arr_types :assert_arr_types
+    alias :respond_to :assert_respond_to
+  end
+end
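For context, a minimal usage sketch of the module above (the Processor class and its method are hypothetical, not part of the gem):

    require 'wgit'

    class Processor
      include Wgit::Assertable

      def process(urls)
        # Raises a RuntimeError unless every element is a Wgit::Url.
        assert_arr_types(urls, Wgit::Url)
        # Raises unless the first url responds to both methods.
        assert_respond_to(urls.first, [:crawled, :to_uri])
        urls
      end
    end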
data/lib/wgit/core_ext.rb
ADDED
@@ -0,0 +1,40 @@
+require_relative 'url'
+
+# @author Michael Telford
+# Script which extends Ruby's core functionality when parsed.
+# Needs to be required separately using `require 'wgit/core_ext'`.
+
+class String
+  # Converts a String into a Wgit::Url object.
+  def to_url
+    Wgit::Url.new(self)
+  end
+end
+
+module Enumerable
+  # Converts each String instance into a Wgit::Url object and returns the new
+  # array.
+  def to_urls
+    map do |element|
+      process_url_element(element)
+    end
+  end
+
+  # Converts each String instance into a Wgit::Url object and returns the
+  # updated array.
+  def to_urls!
+    map! do |element|
+      process_url_element(element)
+    end
+  end
+end
+
+private
+
+def process_url_element(element)
+  if element.is_a? String
+    element.to_url
+  else
+    element
+  end
+end
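A quick sketch of the core extensions in use (the example urls are illustrative):

    require 'wgit'
    require 'wgit/core_ext'

    "http://www.google.co.uk".to_url.class # => Wgit::Url

    # Strings are converted, existing Wgit::Url objects pass through.
    ["http://example.com", Wgit::Url.new("about.html")].to_urls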
data/lib/wgit/crawler.rb
ADDED
@@ -0,0 +1,132 @@
+require_relative 'url'
+require_relative 'document'
+require_relative 'utils'
+require_relative 'assertable'
+require 'net/http' # requires 'uri'
+
+module Wgit
+
+  # @author Michael Telford
+  # Crawler class provides a means of crawling web URL's.
+  # Note that redirects will not be followed by any of the crawling
+  # methods.
+  class Crawler
+    include Assertable
+
+    attr_reader :urls, :docs
+
+    def initialize(*urls)
+      self.urls = urls unless urls.nil?
+      @docs = []
+    end
+
+    def urls=(urls)
+      @urls = []
+      Wgit::Utils.each(urls) { |url| add_url(url) }
+    end
+
+    def [](*urls)
+      self.urls = urls unless urls.nil?
+    end
+
+    def <<(url)
+      add_url(url)
+    end
+
+    # Crawls individual urls, not entire sites.
+    # Returns the last crawled doc.
+    # Yields each doc to the provided block or adds each doc to @docs
+    # which can be accessed by Crawler#docs after the method returns.
+    def crawl_urls(urls = @urls, &block)
+      raise "No urls to crawl" unless urls
+      @docs = []
+      doc = nil
+      Wgit::Utils.each(urls) { |url| doc = handle_crawl_block(url, &block) }
+      doc ? doc : @docs.last
+    end
+
+    # Crawl the url and return the response document or nil.
+    # Also yield(doc) if a block is provided. The doc is passed to the block
+    # regardless of the crawl success so the doc.url can be used if needed.
+    def crawl_url(url = @urls.first, &block)
+      assert_type(url, Url)
+      markup = fetch(url)
+      url.crawled = true
+      doc = Wgit::Document.new(url, markup)
+      block.call(doc) if block_given?
+      doc.empty? ? nil : doc
+    end
+
+    # Crawls an entire site by recursively going through its internal_links.
+    # Also yield(doc) for each crawled doc if a block is provided.
+    # A block is the only way to interact with the crawled docs.
+    # Returns a unique array of external urls collected from the site
+    # or nil if the base_url could not be crawled successfully.
+    def crawl_site(base_url = @urls.first, &block)
+      assert_type(base_url, Url)
+
+      doc = crawl_url(base_url, &block)
+      return nil if doc.nil?
+
+      crawled_urls = []
+      external_urls = doc.external_links
+      internal_urls = doc.internal_links
+
+      return doc.external_links.uniq if internal_urls.empty?
+
+      loop do
+        internal_urls.uniq! unless internal_urls.uniq.nil?
+
+        links = internal_urls - crawled_urls
+        break if links.empty?
+
+        links.each do |link|
+          doc = crawl_url(Wgit::Url.concat(base_url.to_base, link), &block)
+          crawled_urls << link
+          next if doc.nil?
+          internal_urls.concat(doc.internal_links)
+          external_urls.concat(doc.external_links)
+        end
+      end
+
+      external_urls.uniq
+    end
+
+    private
+
+    # Add the document to the @docs array for later processing
+    # or let the block process it here and now.
+    def handle_crawl_block(url, &block)
+      if not block_given?
+        @docs << crawl_url(url)
+        nil
+      else
+        crawl_url(url, &block)
+      end
+    end
+
+    # The fetch method performs an HTTP GET to obtain the HTML document.
+    # Invalid urls or any HTTP response that doesn't return an HTML body
+    # will be ignored and nil will be returned. This means that redirects
+    # etc. will not be followed.
+    def fetch(url)
+      raise unless url.respond_to?(:to_uri)
+      res = Net::HTTP.get_response(url.to_uri)
+      res.body.empty? ? nil : res.body
+    rescue
+      nil
+    end
+
+    def add_url(url)
+      @urls = [] if @urls.nil?
+      if url.instance_of?(Url)
+        @urls << url
+      else
+        @urls << Wgit::Url.new(url)
+      end
+    end
+
+    alias :crawl :crawl_urls
+    alias :crawl_r :crawl_site
+  end
+end
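A minimal crawl sketch against the API above (the url is illustrative):

    require 'wgit'

    crawler = Wgit::Crawler.new Wgit::Url.new("http://txti.es")

    # Crawl a single page; nil is returned if the fetch fails.
    doc = crawler.crawl_url
    puts doc.title unless doc.nil?

    # Crawl the whole site; the block sees every crawled Wgit::Document
    # and the unique external urls are returned.
    ext_urls = crawler.crawl_site { |d| puts d.url }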
data/lib/wgit/database/database.rb
ADDED
@@ -0,0 +1,269 @@
+require_relative '../document'
+require_relative '../url'
+require_relative '../utils'
+require_relative '../assertable'
+require_relative 'mongo_connection_details'
+require_relative 'model'
+require 'mongo'
+
+module Wgit
+
+  # @author Michael Telford
+  # Class modeling a DB connection and CRUD operations for the Url and
+  # Document collections.
+  # The most common methods are: insert, update, urls, search, stats, size.
+  class Database
+    include Assertable
+
+    # Is relative to the root project folder, not this file.
+    LOG_FILE_PATH = "misc/mongo_log.txt"
+
+    def initialize
+      conn_details = Wgit::CONNECTION_DETAILS
+      if conn_details.empty?
+        raise "Wgit::CONNECTION_DETAILS must be defined and include :host,
+        :port, :db, :uname, :pword for a database connection to be established."
+      end
+
+      logger = Logger.new(LOG_FILE_PATH)
+      address = "#{conn_details[:host]}:#{conn_details[:port]}"
+      @@client = Mongo::Client.new([address],
+                                   :database => conn_details[:db],
+                                   :user => conn_details[:uname],
+                                   :password => conn_details[:pword],
+                                   :logger => logger,
+                                   :truncate_logs => false)
+    end
+
+    ### Create Data ###
+
+    def insert(data)
+      if data.is_a?(Url)
+        insert_urls(data)
+      elsif data.is_a?(Document)
+        insert_docs(data)
+      elsif data.respond_to?(:first)
+        if data.first.is_a?(Url)
+          insert_urls(data)
+        else
+          insert_docs(data)
+        end
+      else
+        raise "data is not in the correct format (all Url's or Document's)"
+      end
+    end
+
+    def insert_urls(url_or_urls)
+      unless url_or_urls.respond_to?(:map)
+        assert_type(url_or_urls, Url)
+        url_or_urls = Wgit::Model.url(url_or_urls)
+      else
+        assert_arr_types(url_or_urls, Url)
+        url_or_urls = url_or_urls.map do |url|
+          Wgit::Model.url(url)
+        end
+      end
+      create(:urls, url_or_urls)
+    end
+
+    def insert_docs(doc_or_docs)
+      unless doc_or_docs.respond_to?(:map)
+        assert_type(doc_or_docs, [Document, Hash])
+        unless doc_or_docs.is_a?(Hash)
+          doc_or_docs = Wgit::Model.document(doc_or_docs)
+        end
+      else
+        assert_arr_types(doc_or_docs, [Document, Hash])
+        doc_or_docs = doc_or_docs.map do |doc|
+          Wgit::Model.document(doc) unless doc.is_a?(Hash)
+        end
+      end
+      create(:documents, doc_or_docs)
+    end
+
+    ### Retrieve Data ###
+
+    # A crawled parameter value of nil (the default) returns all urls.
+    # A limit of 0 means all urls are returned.
+    # All urls are sorted by date_added ascending, in other words the first
+    # url in the results is the first added.
+    def urls(crawled = nil, limit = 0, skip = 0, &block)
+      crawled.nil? ? query = {} : query = { :crawled => crawled }
+
+      sort = { :date_added => 1 }
+      results = retrieve(:urls, query, sort, {}, limit, skip)
+      return [] if results.count < 1
+
+      # results.respond_to? :map! is false so we use map and overwrite the var.
+      results = results.map { |url_doc| Wgit::Url.new(url_doc) }
+      return results unless block_given?
+      results.each { |url| block.call(url) }
+    end
+
+    def crawled_urls(limit = 0, skip = 0, &block)
+      urls(true, limit, skip, &block)
+    end
+
+    def uncrawled_urls(limit = 0, skip = 0, &block)
+      urls(false, limit, skip, &block)
+    end
+
+    # Currently all searches are case insensitive.
+    #
+    # Searches against the indexed docs in the DB for the given text.
+    # The searched fields are decided by the text index setup against the
+    # documents collection. Currently we search against the following fields:
+    # "author", "keywords", "title" and "text".
+    #
+    # The MongoDB search ranks/sorts the results in order (highest first) based
+    # upon each document's textScore which records the number of text hits. We
+    # then store this textScore in each Document object for use elsewhere if
+    # needed.
+    #
+    # @param text [String] the value to search the data against.
+    # @param whole_sentence [Boolean] whether multiple words should be
+    #   searched for as a whole sentence rather than separately.
+    # @param limit [Fixnum] the max length/count of the results array.
+    # @param skip [Fixnum] the number of results to skip, starting with the
+    #   most relevant based upon the textScore of the search.
+    # @param block [Block] a block which if provided is passed to each result.
+    #
+    # @return [Array] of Document objects representing the search results.
+    def search(text, whole_sentence = false, limit = 10, skip = 0, &block)
+      text.strip!
+      text.replace("\"" + text + "\"") if whole_sentence
+
+      # The textScore sorts based on the most search hits.
+      # We use the textScore hash as a sort and a projection below.
+      # :$caseSensitive => case_sensitive, # 3.2+ only.
+      sort_proj = { :score => { :$meta => "textScore" } }
+      query = { :$text => { :$search => text } }
+      results = retrieve(:documents, query, sort_proj, sort_proj, limit, skip)
+
+      return [] if results.count < 1
+      # results.respond_to? :map! is false so we use map and overwrite the var.
+      results = results.map { |mongo_doc| Wgit::Document.new(mongo_doc) }
+      return results unless block_given?
+      results.each { |doc| block.call(doc) }
+    end
+
+    # Performs a search and pretty prints the results.
+    def search_p(text, whole_sentence = false, limit = 10,
+                 skip = 0, sentence_length = 80, &block)
+      results = search(text, whole_sentence, limit, skip, &block)
+      Wgit::Utils.printf_search_results(results, text, false, sentence_length)
+    end
+
+    # Returns a Mongo object which can be used like a Hash to retrieve values.
+    def stats
+      @@client.command(:dbStats => 0).documents[0]
+    end
+
+    def size
+      stats[:dataSize]
+    end
+
+    ### Update Data ###
+
+    def update(data)
+      if data.is_a?(Url)
+        update_url(data)
+      elsif data.is_a?(Document)
+        update_doc(data)
+      else
+        raise "data is not in the correct format (all Url's or Document's)"
+      end
+    end
+
+    def update_url(url)
+      assert_type(url, Url)
+      selection = { :url => url }
+      url_hash = Wgit::Model.url(url).merge(Wgit::Model.common_update_data)
+      update = { "$set" => url_hash }
+      _update(true, :urls, selection, update)
+    end
+
+    def update_doc(doc)
+      assert_type(doc, Document)
+      selection = { :url => doc.url }
+      doc_hash = Wgit::Model.document(doc).merge(Wgit::Model.common_update_data)
+      update = { "$set" => doc_hash }
+      _update(true, :documents, selection, update)
+    end
+
+    private
+
+    def write_succeeded?(result, count = 1, multi = false)
+      case result.class.to_s
+      # Single create result.
+      when "Mongo::Operation::Write::Insert::Result"
+        result.documents.first[:err].nil?
+      # Multiple create result.
+      when "Mongo::BulkWrite::Result"
+        result.inserted_count == count
+      # Single and multiple update result.
+      when "Mongo::Operation::Write::Update::Result", # MongoDB 3.0
+           "Mongo::Operation::Write::Update::LegacyResult" # MongoDB 2.4
+        if multi
+          result.n == count
+        else
+          result.documents.first[:err].nil?
+        end
+      else
+        raise "Result class not currently supported: #{result.class.to_s}"
+      end
+    end
+
+    def create(collection, data)
+      assert_type(data, [Hash, Array])
+      # Single doc.
+      if data.is_a?(Hash)
+        data.merge!(Wgit::Model.common_insert_data)
+        result = @@client[collection.to_sym].insert_one(data)
+        unless write_succeeded?(result)
+          raise "DB write (insert) failed"
+        end
+        result.n
+      # Multiple docs.
+      elsif data.is_a?(Array)
+        assert_arr_types(data, Hash)
+        data.map! do |data_hash|
+          data_hash.merge(Wgit::Model.common_insert_data)
+        end
+        result = @@client[collection.to_sym].insert_many(data)
+        unless write_succeeded?(result, data.length)
+          raise "DB write(s) failed"
+        end
+        result.inserted_count
+      else
+        raise "data must be a Hash or an Array of Hash's"
+      end
+    end
+
+    def retrieve(collection, query, sort = {}, projection = {},
+                 limit = 0, skip = 0)
+      assert_type(query, Hash)
+      @@client[collection.to_sym].find(query).projection(projection)
+                                 .skip(skip).limit(limit).sort(sort)
+    end
+
+    # NOTE: The Model.common_update_data should be merged in the calling
+    # method as the update param can be bespoke due to its nature.
+    def _update(single, collection, selection, update)
+      assert_arr_types([selection, update], Hash)
+      if single
+        result = @@client[collection.to_sym].update_one(selection, update)
+      else
+        result = @@client[collection.to_sym].update_many(selection, update)
+      end
+      raise "DB write (update) failed" unless write_succeeded?(result)
+      result.n
+    end
+
+    alias :count :size
+    alias :length :size
+    alias :insert_url :insert_urls
+    alias :insert_doc :insert_docs
+    alias :search_and_format :search_p
+  end
+end
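A usage sketch, assuming a reachable MongoDB configured via Wgit::CONNECTION_DETAILS and the text index described in the comments above (the url and search term are illustrative):

    require 'wgit'

    db = Wgit::Database.new

    db.insert(Wgit::Url.new("http://example.com"))
    puts db.uncrawled_urls.inspect

    # Searches the indexed fields; results are Wgit::Document objects.
    db.search("ruby crawler", false, 5).each { |doc| puts doc.title }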
data/lib/wgit/database/model.rb
ADDED
@@ -0,0 +1,31 @@
+require_relative '../utils'
+
+module Wgit
+
+  # @author Michael Telford
+  # Module containing the DB data model structure.
+  module Model
+    def self.url(url)
+      raise "url must respond to to_h" unless url.respond_to?(:to_h)
+      url.to_h
+    end
+
+    def self.document(doc)
+      raise "doc must respond to to_h" unless doc.respond_to?(:to_h)
+      doc.to_h(false)
+    end
+
+    def self.common_insert_data
+      {
+        :date_added => Wgit::Utils.time_stamp,
+        :date_modified => Wgit::Utils.time_stamp,
+      }
+    end
+
+    def self.common_update_data
+      {
+        :date_modified => Wgit::Utils.time_stamp,
+      }
+    end
+  end
+end
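A small sketch of how the model maps objects to DB records (the url is illustrative):

    require 'wgit'

    url = Wgit::Url.new("http://example.com")
    record = Wgit::Model.url(url).merge(Wgit::Model.common_insert_data)
    # => { :url => "http://example.com", :crawled => false,
    #      :date_crawled => nil, :date_added => ..., :date_modified => ... }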
data/lib/wgit/database/mongo_connection_details.rb
ADDED
@@ -0,0 +1,27 @@
+
+# @author Michael Telford
+module Wgit
+  DB_PROVIDER = :MongoLabs.freeze
+
+  # OpenShift (MongoDB 2.4)
+  if DB_PROVIDER == :OpenShift
+    CONNECTION_DETAILS = {
+      :host => "127.0.0.1",
+      :port => "27017",
+      :db => "admin",
+      :uname => "admin",
+      :pword => "R5jUKv1fessb"
+    }.freeze
+  # MongoLabs (MongoDB 3.0)
+  elsif DB_PROVIDER == :MongoLabs
+    CONNECTION_DETAILS = {
+      :host => "ds037205.mongolab.com",
+      :port => "37205",
+      :db => "crawler",
+      :uname => "rubyapp",
+      :pword => "R5jUKv1fessb",
+    }.freeze
+  else
+    raise "Database provider '#{DB_PROVIDER}' is not recognized"
+  end
+end
data/lib/wgit/document.rb
ADDED
@@ -0,0 +1,293 @@
+require_relative 'url'
+require_relative 'utils'
+require_relative 'assertable'
+require 'nokogiri'
+
+module Wgit
+
+  # @author Michael Telford
+  # Class modeling an HTML web document. Also doubles as a search result.
+  class Document
+    include Assertable
+
+    TEXT_ELEMENTS = [:dd, :div, :dl, :dt, :figcaption, :figure, :hr, :li,
+                     :main, :ol, :p, :pre, :span, :ul, :h1, :h2, :h3, :h4, :h5]
+
+    attr_reader :url, :html, :title, :author, :keywords, :links, :text, :score
+
+    def initialize(url_or_doc, html = nil)
+      if (url_or_doc.is_a?(String))
+        assert_type(url_or_doc, Url)
+        html ||= ""
+
+        @url = url_or_doc
+        @html = html
+
+        @doc = Nokogiri::HTML(html) do |config|
+          # TODO: Remove #'s below when crawling in production.
+          #config.options = Nokogiri::XML::ParseOptions::STRICT |
+          #                 Nokogiri::XML::ParseOptions::NONET
+        end
+
+        init_title
+        init_author
+        init_keywords
+        init_links
+        init_text
+        @score = 0.0
+      else
+        # Init from a mongo collection document.
+        @url = Wgit::Url.new(url_or_doc[:url])
+        @html = url_or_doc[:html].nil? ? "" : url_or_doc[:html]
+        @title = url_or_doc[:title]
+        @author = url_or_doc[:author]
+        @keywords = url_or_doc[:keywords].nil? ? [] : url_or_doc[:keywords]
+        @links = url_or_doc[:links].nil? ? [] : url_or_doc[:links]
+        @links.map! { |link| Wgit::Url.new(link) }
+        @text = url_or_doc[:text].nil? ? [] : url_or_doc[:text]
+        @score = url_or_doc[:score].nil? ? 0.0 : url_or_doc[:score]
+      end
+    end
+
+    def internal_links
+      return [] if @links.empty?
+      @links.reject do |link|
+        begin
+          not link.relative_link?
+        rescue
+          true
+        end
+      end
+    end
+
+    def internal_full_links
+      return [] if internal_links.empty?
+      internal_links.map do |link|
+        link.replace("/" + link) unless link.start_with?("/")
+        Wgit::Url.new(@url.to_base + link)
+      end
+    end
+
+    def external_links
+      return [] if @links.empty?
+      @links.reject do |link|
+        begin
+          link.relative_link?
+        rescue
+          true
+        end
+      end
+    end
+
+    def stats
+      hash = {}
+      instance_variables.each do |var|
+        # Add up the total bytes of text as well as the length.
+        if var == :@text
+          count = 0
+          @text.each { |t| count += t.length }
+          hash[:text_length] = @text.length
+          hash[:text_bytes] = count
+        # Else take the #length method return value.
+        else
+          next unless instance_variable_get(var).respond_to?(:length)
+          hash[var[1..-1].to_sym] =
+            instance_variable_get(var).send(:length)
+        end
+      end
+      hash
+    end
+
+    def size
+      stats[:html]
+    end
+
+    def to_h(include_html = false)
+      ignore = include_html ? [] : [:@html]
+      ignore << :@doc # Always ignore :@doc
+      Wgit::Utils.to_h(self, ignore)
+    end
+
+    # Override of the default == method, is equal if url and html both match.
+    # Use doc.object_id == other_doc.object_id for exact object comparison.
+    def ==(other_doc)
+      return false unless other_doc.is_a? Wgit::Document
+      url == other_doc.url and html == other_doc.html
+    end
+
+    # Shortcut for calling Document#html[range].
+    def [](range)
+      html[range]
+    end
+
+    def empty?
+      html.strip.empty?
+    end
+
+    # Searches against the Document#text for the given search text.
+    # The number of search hits for each sentence is recorded internally
+    # and used to rank/sort the search results before being returned. Where
+    # the Database#search method searches all documents for the most hits, this
+    # method searches each document's text for the most hits.
+    #
+    # Each search result consists of a sentence of a given length. The length
+    # will be based on the sentence_limit parameter or the full length of the
+    # original sentence, whichever is less. The algorithm ensures
+    # that the search value is visible somewhere in the sentence.
+    #
+    # @param text [String] the value to search the document text against.
+    # @param sentence_limit [Fixnum] the length of each search result
+    #   sentence.
+    #
+    # @return [Array] of String objects representing the search results.
+    def search(text, sentence_limit = 80)
+      raise "A search value must be provided" if text.empty?
+      raise "The sentence length value must be even" if sentence_limit.odd?
+
+      results = {}
+      regex = Regexp.new(text, Regexp::IGNORECASE)
+
+      @text.each do |sentence|
+        hits = sentence.scan(regex).count
+        if hits > 0
+          sentence.strip!
+          index = sentence.index(regex)
+          Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
+          results[sentence] = hits
+        end
+      end
+
+      return [] if results.empty?
+      results = Hash[results.sort_by { |k, v| v }]
+      results.keys.reverse
+    end
+
+    # Performs a text search (see search for details) but assigns the results
+    # to the @text instance variable. This can be used for sub search
+    # functionality. Note that there is no way of getting the original text
+    # back however.
+    def search!(text)
+      @text = search(text)
+    end
+
+    # Uses Nokogiri's xpath method to search the doc's html and return the
+    # results.
+    def xpath(xpath)
+      @doc.xpath(xpath)
+    end
+
+    private
+
+    def process_str(str)
+      str.encode!('UTF-8', 'UTF-8', :invalid => :replace)
+      str.strip!
+      str # This is required to return the str, do not remove.
+    end
+
+    def process_arr(array)
+      assert_arr_types(array, String)
+      array.map! { |str| process_str(str) }
+      array.reject! { |str| str.empty? }
+      array.uniq!
+    end
+
+    # Modifies internal links by removing this doc's base or host url if
+    # present. http://www.google.co.uk/about.html (with or without the
+    # protocol prefix) will become about.html meaning it'll appear within
+    # internal_links.
+    def process_internal_links(links)
+      links.map! do |link|
+        host_or_base = if link.start_with?("http")
+                         url.base
+                       else
+                         url.host
+                       end
+        if link.start_with?(host_or_base)
+          link.sub!(host_or_base, "")
+          link.replace(link[1..-1]) if link.start_with?("/")
+          link.strip!
+        end
+        link
+      end
+    end
+
+    def text_elements_xpath
+      xpath = ""
+      return xpath if TEXT_ELEMENTS.empty?
+      el_xpath = "//%s/text()"
+      TEXT_ELEMENTS.each_with_index do |el, i|
+        xpath += " | " unless i == 0
+        xpath += el_xpath % [el]
+      end
+      xpath
+    end
+
+    def init_var(xpath, var, first_result = true)
+      results = @doc.xpath(xpath)
+      unless results.nil? || results.empty?
+        result = if first_result
+                   results.first.content
+                 else
+                   results.map { |res| res.content }
+                 end
+        instance_variable_set(var, result)
+      end
+    end
+
+    def init_title
+      @title = nil
+      xpath = "//title"
+      init_var(xpath, :@title)
+      process_str(@title) unless @title.nil?
+    end
+
+    def init_author
+      @author = nil
+      xpath = "//meta[@name='author']/@content"
+      init_var(xpath, :@author)
+      process_str(@author) unless @author.nil?
+    end
+
+    def init_keywords
+      @keywords = nil
+      xpath = "//meta[@name='keywords']/@content"
+      init_var(xpath, :@keywords)
+      return @keywords = [] unless @keywords
+      @keywords = @keywords.split(",")
+      process_arr(@keywords)
+    end
+
+    def init_links
+      @links = nil
+      xpath = "//a/@href"
+      init_var(xpath, :@links, false)
+      return @links = [] unless @links
+      process_arr(@links)
+      @links.reject! { |link| link == "/" }
+      @links.map! do |link|
+        begin
+          Wgit::Url.new(link)
+        rescue
+          nil
+        end
+      end
+      @links.reject! { |link| link.nil? }
+      process_internal_links(@links)
+    end
+
+    def init_text
+      @text = nil
+      xpath = text_elements_xpath
+      init_var(xpath, :@text, false)
+      return @text = [] unless @text
+      process_arr(@text)
+    end
+
+    alias :to_hash :to_h
+    alias :relative_links :internal_links
+    alias :relative_urls :internal_links
+    alias :relative_full_links :internal_full_links
+    alias :relative_full_urls :internal_full_links
+    alias :external_urls :external_links
+  end
+end
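A parsing sketch for the class above (the html and url are illustrative):

    require 'wgit'

    html = "<html><title>Hello</title><p>Hello world, from Wgit.</p></html>"
    doc = Wgit::Document.new(Wgit::Url.new("http://example.com"), html)

    doc.title                      # => "Hello"
    doc.text                       # => ["Hello world, from Wgit."]
    doc.search("hello").first      # => "Hello world, from Wgit."
    doc.xpath("//p").first.content # => "Hello world, from Wgit."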
data/lib/wgit/url.rb
ADDED
@@ -0,0 +1,140 @@
+require_relative 'utils'
+require 'uri'
+
+module Wgit
+
+  # @author Michael Telford
+  # Class modeling a web based URL.
+  # Can be an internal link e.g. "about.html"
+  # or a full URL e.g. "http://www.google.co.uk".
+  class Url < String
+    attr_accessor :crawled, :date_crawled
+
+    def initialize(url_or_doc, crawled = false, date_crawled = nil)
+      if (url_or_doc.is_a?(String))
+        url = url_or_doc
+      else
+        # Init from a mongo collection document.
+        url = url_or_doc[:url]
+        crawled = url_or_doc[:crawled].nil? ? false : url_or_doc[:crawled]
+        date_crawled = url_or_doc[:date_crawled]
+      end
+      @uri = URI(url)
+      @crawled = crawled
+      @date_crawled = date_crawled
+      super(url)
+    end
+
+    def self.validate(url)
+      if Wgit::Url.relative_link?(url)
+        raise "Invalid url (or a relative link): #{url}"
+      end
+      unless url.start_with?("http://") or url.start_with?("https://")
+        raise "Invalid url (missing protocol prefix): #{url}"
+      end
+      if URI.regexp.match(url).nil?
+        raise "Invalid url: #{url}"
+      end
+    end
+
+    def self.valid?(url)
+      Wgit::Url.validate(url)
+      true
+    rescue
+      false
+    end
+
+    # Modifies the receiver url by prefixing it with a protocol.
+    # Returns the url whether it's been modified or not.
+    def self.prefix_protocol(url, https = false)
+      unless url.start_with?("http://") or url.start_with?("https://")
+        if https
+          url.replace("https://#{url}")
+        else
+          url.replace("http://#{url}")
+        end
+      end
+      url
+    end
+
+    # URI.split("http://www.google.co.uk/about.html") returns the following:
+    # array[2]: "www.google.co.uk", array[5]: "/about.html".
+    # This means that all external links in a page are expected to have a
+    # protocol prefix e.g. "http://", otherwise the link is treated as an
+    # internal link (regardless of whether it is valid or not).
+    def self.relative_link?(link)
+      link_segs = URI.split(link)
+      if not link_segs[2].nil? and not link_segs[2].empty?
+        false
+      elsif not link_segs[5].nil? and not link_segs[5].empty?
+        true
+      else
+        raise "Invalid link: #{link}"
+      end
+    end
+
+    def self.concat(host, link)
+      url = host
+      url.chop! if url.end_with?("/")
+      link = link[1..-1] if link.start_with?("/")
+      Wgit::Url.new(url + "/" + link)
+    end
+
+    def relative_link?
+      Wgit::Url.relative_link?(self)
+    end
+
+    def valid?
+      Wgit::Url.valid?(self)
+    end
+
+    def concat(link)
+      Wgit::Url.concat(self, link)
+    end
+
+    def crawled=(bool)
+      @crawled = bool
+      @date_crawled = bool ? Wgit::Utils.time_stamp : nil
+    end
+
+    def to_uri
+      @uri
+    end
+
+    def to_url
+      self
+    end
+
+    # Given http://www.google.co.uk/about.html, www.google.co.uk is returned.
+    def to_host
+      Wgit::Url.new(@uri.host)
+    end
+
+    # URI.split("http://www.google.co.uk/about.html") returns the following:
+    # array[0]: "http", array[2]: "www.google.co.uk".
+    # Returns array[0] + "://" + array[2] e.g. http://www.google.co.uk.
+    def to_base
+      if Wgit::Url.relative_link?(self)
+        raise "A relative link doesn't have a base URL: #{self}"
+      end
+      url_segs = URI.split(self)
+      if url_segs[0].nil? or url_segs[2].nil? or url_segs[2].empty?
+        raise "Both a protocol and host are needed: #{self}"
+      end
+      base = "#{url_segs[0]}://#{url_segs[2]}"
+      Wgit::Url.new(base)
+    end
+
+    def to_h
+      ignore = [:@uri]
+      h = Wgit::Utils.to_h(self, ignore)
+      Hash[h.to_a.insert(0, [:url, self])] # Insert url at position 0.
+    end
+
+    alias :to_hash :to_h
+    alias :host :to_host
+    alias :base :to_base
+    alias :internal_link? :relative_link?
+    alias :crawled? :crawled
+  end
+end
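A few illustrative calls against the class above:

    require 'wgit'

    url = Wgit::Url.new("http://www.google.co.uk/about.html")
    url.to_host        # => "www.google.co.uk"
    url.to_base        # => "http://www.google.co.uk"
    url.relative_link? # => false
    url.crawled = true # Also sets url.date_crawled.

    Wgit::Url.new("about.html").relative_link? # => true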
data/lib/wgit/utils.rb
ADDED
@@ -0,0 +1,115 @@
+
+module Wgit
+
+  # @author Michael Telford
+  # Utility module containing generic methods.
+  module Utils
+    def self.time_stamp
+      Time.new
+    end
+
+    # Returns a hash created from obj's instance vars and values.
+    def self.to_h(obj, ignore = [])
+      hash = {}
+      obj.instance_variables.each do |var|
+        next if ignore.include?(var)
+        hash[var[1..-1].to_sym] = obj.instance_variable_get(var)
+      end
+      hash
+    end
+
+    # Improved each method which takes care of singleton and enumerable
+    # objects. Yields one or more objects.
+    def self.each(obj_or_objs)
+      if obj_or_objs.respond_to?(:each)
+        obj_or_objs.each { |obj| yield obj }
+      else
+        yield obj_or_objs
+      end
+    end
+
+    # Formats the sentence (modifies the receiver) and returns its value.
+    # The length will be based on the sentence_limit parameter or the full
+    # length of the original sentence, whichever is less. The full sentence
+    # is returned if the sentence_limit is 0. The algorithm ensures
+    # that the search value is visible somewhere in the sentence.
+    def self.format_sentence_length(sentence, index, sentence_limit)
+      raise "A sentence value must be provided" if sentence.empty?
+      raise "The sentence length value must be even" if sentence_limit.odd?
+      if index < 0 or index > sentence.length
+        raise "Incorrect index value: #{index}"
+      end
+
+      return sentence if sentence_limit == 0
+
+      start = 0
+      finish = sentence.length
+
+      if sentence.length > sentence_limit
+        start = index - (sentence_limit / 2)
+        finish = index + (sentence_limit / 2)
+
+        if start < 0
+          diff = 0 - start
+          if (finish + diff) > sentence.length
+            finish = sentence.length
+          else
+            finish += diff
+          end
+          start = 0
+        elsif finish > sentence.length
+          diff = finish - sentence.length
+          if (start - diff) < 0
+            start = 0
+          else
+            start -= diff
+          end
+          finish = sentence.length
+        end
+
+        raise if sentence[start..(finish - 1)].length != sentence_limit
+      end
+
+      sentence.replace(sentence[start..(finish - 1)])
+    end
+
+    # Prints out the search results in a search engine page format.
+    # Most of the params are passed to Document#search - see class docs.
+    # The stream param decides where the printf output is written to, and
+    # therefore must respond_to? :puts
+    # The format for each result is:
+    #
+    # Title
+    # Keywords (if there are some)
+    # Text Snippet (showing the searched for text if provided)
+    # Url
+    # <empty_line>
+    def self.printf_search_results(results, text = nil, case_sensitive = false,
+                                   sentence_length = 80, keyword_count = 5,
+                                   stream = Kernel)
+      raise "stream must respond_to? :puts" unless stream.respond_to? :puts
+      keyword_count -= 1 # Because Arrays are zero indexed.
+
+      results.each do |doc|
+        sentence = if text.nil?
+                     nil
+                   else
+                     sentence = doc.search(text, sentence_length).first
+                     if sentence.nil?
+                       nil
+                     else
+                       sentence.strip.empty? ? nil : sentence
+                     end
+                   end
+        stream.puts doc.title
+        unless doc.keywords.empty?
+          stream.puts doc.keywords[0..keyword_count].join(", ")
+        end
+        stream.puts sentence unless sentence.nil?
+        stream.puts doc.url
+        stream.puts
+      end
+      nil
+    end
+  end
+end
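A worked example of the sentence formatting above (values are illustrative):

    require 'wgit'

    sentence = "The quick brown fox jumps over the lazy dog"
    index = sentence.index("fox") # => 16
    # Shrinks the receiver to a 10 character window around the match.
    Wgit::Utils.format_sentence_length(sentence, index, 10)
    sentence # => "rown fox j"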
data/lib/wgit/version.rb
ADDED
@@ -0,0 +1,3 @@
+module Wgit
+  VERSION = "0.0.1"
+end
data/lib/wgit/web_crawler.rb
ADDED
@@ -0,0 +1,134 @@
+#!/usr/bin/env ruby
+
+require_relative 'crawler'
+require_relative 'database/database'
+
+# @author Michael Telford
+module Wgit
+
+  # Convenience method to crawl the World Wide Web.
+  # The default value (-1) for max_sites_to_crawl is unrestricted.
+  # The default max_data_size is 1GB.
+  def self.crawl_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
+    db = Wgit::Database.new
+    web_crawler = Wgit::WebCrawler.new(db, max_sites_to_crawl, max_data_size)
+    web_crawler.crawl_the_web
+  end
+
+  # Class which sets up a crawler and saves the indexed
+  # docs to a database. Will crawl the web forever if you let it :-)
+  class WebCrawler
+    attr_accessor :max_sites_to_crawl, :max_data_size
+    attr_reader :crawler, :db
+
+    def initialize(database,
+                   max_sites_to_crawl = -1,
+                   max_data_size = 1048576000)
+      @crawler = Wgit::Crawler.new
+      @db = database
+      @max_sites_to_crawl = max_sites_to_crawl
+      @max_data_size = max_data_size
+    end
+
+    # Retrieves urls from the database and recursively crawls each site
+    # storing their internal pages into the database and adding their external
+    # urls to be crawled at a later date.
+    def crawl_the_web
+      if max_sites_to_crawl < 0
+        puts "Crawling until the database has been filled or it runs out of \
+urls to crawl (which might be never)."
+      end
+      loop_count = 0
+
+      while keep_crawling?(loop_count) do
+        puts "Current database size: #{db.size}"
+        crawler.urls = db.uncrawled_urls
+
+        if crawler.urls.empty?
+          puts "No urls to crawl, exiting."
+          break
+        end
+        puts "Starting crawl loop for: #{crawler.urls}"
+
+        docs_count = 0
+        urls_count = 0
+
+        crawler.urls.each do |url|
+          unless keep_crawling?(loop_count)
+            puts "Reached max number of sites to crawl or database \
+capacity, exiting."
+            return
+          end
+          loop_count += 1
+
+          url.crawled = true
+          raise unless db.update(url) == 1
+
+          site_docs_count = 0
+          ext_links = crawler.crawl_site(url) do |doc|
+            unless doc.empty?
+              if write_doc_to_db(doc)
+                docs_count += 1
+                site_docs_count += 1
+              end
+            end
+          end
+
+          urls_count += write_urls_to_db(ext_links)
+          puts "Crawled and saved #{site_docs_count} docs for the \
+site: #{url}"
+        end
+
+        puts "Crawled and saved docs for #{docs_count} url(s) overall for \
+this iteration."
+        puts "Found and saved #{urls_count} external url(s) for the next \
+iteration."
+      end
+    end
+
+    private
+
+    # Keep crawling or not based on DB size and current loop iteration.
+    def keep_crawling?(loop_count)
+      return false if db.size >= max_data_size
+      # If max_sites_to_crawl is -1 for example then crawl away.
+      if max_sites_to_crawl < 0
+        true
+      else
+        loop_count < max_sites_to_crawl
+      end
+    end
+
+    # The unique url index on the documents collection prevents duplicate
+    # inserts.
+    def write_doc_to_db(doc)
+      db.insert(doc)
+      puts "Saved document for url: #{doc.url}"
+      true
+    rescue Mongo::Error::OperationFailure
+      puts "Document already exists: #{doc.url}"
+      false
+    end
+
+    # The unique url index on the urls collection prevents duplicate inserts.
+    def write_urls_to_db(urls)
+      count = 0
+      if urls.respond_to?(:each)
+        urls.each do |url|
+          begin
+            db.insert(url)
+            count += 1
+            puts "Inserted url: #{url}"
+          rescue Mongo::Error::OperationFailure
+            puts "Url already exists: #{url}"
+          end
+        end
+      end
+      count
+    end
+  end
+end
+
+if __FILE__ == $0
+  Wgit.crawl_the_web
+end
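A usage sketch, assuming a reachable MongoDB seeded with at least one uncrawled url (the url and limits are illustrative):

    require 'wgit'

    Wgit::Database.new.insert(Wgit::Url.new("http://txti.es"))

    # Crawl at most 5 sites, capped at roughly 100MB of stored data.
    Wgit.crawl_the_web(5, 104857600)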
metadata
ADDED
@@ -0,0 +1,62 @@
+--- !ruby/object:Gem::Specification
+name: wgit
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- Michael Telford
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2016-03-07 00:00:00.000000000 Z
+dependencies: []
+description: Wgit is a WWW indexer/scraper which crawls URL's and retrieves their
+  page contents for later use. Also included in this package is a means to search
+  indexed documents stored in a database. Therefore this library provides the main
+  components of a WWW search engine. You can also use Wgit to copy entire website's
+  HTML making it far more powerful than wget. The Wgit API is easily extendable allowing
+  you to easily pull out the parts of a webpage that are important to you, the CSS
+  or JS links for example.
+email: michael.telford@live.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- "./lib/wgit.rb"
+- "./lib/wgit/assertable.rb"
+- "./lib/wgit/core_ext.rb"
+- "./lib/wgit/crawler.rb"
+- "./lib/wgit/database/database.rb"
+- "./lib/wgit/database/model.rb"
+- "./lib/wgit/database/mongo_connection_details.rb"
+- "./lib/wgit/document.rb"
+- "./lib/wgit/url.rb"
+- "./lib/wgit/utils.rb"
+- "./lib/wgit/version.rb"
+- "./lib/wgit/web_crawler.rb"
+homepage: http://rubygems.org/gems/wgit
+licenses:
+- MIT
+metadata:
+  allowed_push_host: https://rubygems.org
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.4.5
+signing_key:
+specification_version: 4
+summary: Wgit is wget on steroids with an easy to use API.
+test_files: []