wgit 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/lib/wgit.rb +1 -1
- data/lib/wgit/assertable.rb +72 -61
- data/lib/wgit/core_ext.rb +11 -5
- data/lib/wgit/crawler.rb +97 -57
- data/lib/wgit/database/database.rb +247 -170
- data/lib/wgit/database/model.rb +40 -24
- data/lib/wgit/database/mongo_connection_details.rb +44 -23
- data/lib/wgit/document.rb +534 -233
- data/lib/wgit/indexer.rb +235 -0
- data/lib/wgit/url.rb +199 -121
- data/lib/wgit/utils.rb +143 -96
- data/lib/wgit/version.rb +5 -1
- metadata +10 -9
- data/lib/wgit/web_crawler.rb +0 -134
data/lib/wgit/utils.rb
CHANGED
@@ -1,115 +1,162 @@
|
|
1
|
-
|
2
1
|
module Wgit
|
3
2
|
|
4
|
-
# @author Michael Telford
|
5
3
|
# Utility module containing generic methods.
|
6
4
|
module Utils
|
7
|
-
|
8
|
-
|
9
|
-
|
5
|
+
|
6
|
+
# Returns the current time stamp.
|
7
|
+
#
|
8
|
+
# @return [Time] The current time stamp.
|
9
|
+
def self.time_stamp
|
10
|
+
Time.new
|
11
|
+
end
|
10
12
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
13
|
+
# Returns a Hash created from obj's instance vars and values.
|
14
|
+
#
|
15
|
+
# @param obj [Object] The object to process.
|
16
|
+
# @param ignore [Array<String>] Attributes to ignore.
|
17
|
+
# @param use_strings_as_keys [Boolean] Whether or not to use strings as
|
18
|
+
# the keys in the returned Hash. Symbols are used otherwise.
|
19
|
+
# @return [Hash] A Hash created from obj's instance vars and values.
|
20
|
+
def self.to_h(obj, ignore = [], use_strings_as_keys = true)
|
21
|
+
hash = {}
|
22
|
+
obj.instance_variables.each do |var|
|
23
|
+
next if ignore.include?(var.to_s)
|
24
|
+
key = var.to_s[1..-1]
|
25
|
+
key = key.to_sym unless use_strings_as_keys
|
26
|
+
hash[key] = obj.instance_variable_get(var)
|
19
27
|
end
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
28
|
+
hash
|
29
|
+
end
|
30
|
+
|
31
|
+
# Returns the model having removed non bson types (for use with MongoDB).
|
32
|
+
#
|
33
|
+
# @param model_hash [Hash] The model Hash to process.
|
34
|
+
# @return [Hash] The model Hash with non bson types removed.
|
35
|
+
def self.remove_non_bson_types(model_hash)
|
36
|
+
model_hash.reject do |k, v|
|
37
|
+
not v.respond_to? :bson_type
|
29
38
|
end
|
39
|
+
end
|
30
40
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
return sentence if sentence_limit == 0
|
41
|
+
# An improved :each method which accepts both singleton and Enumerable
|
42
|
+
# objects (as opposed to just an Enumerable object).
|
43
|
+
#
|
44
|
+
# @yield [el] Gives each element of obj_or_objects if it's Enumerable,
|
45
|
+
# otherwise obj_or_objs itself is given.
|
46
|
+
def self.each(obj_or_objs)
|
47
|
+
if obj_or_objs.respond_to?(:each)
|
48
|
+
obj_or_objs.each { |obj| yield(obj) }
|
49
|
+
else
|
50
|
+
yield(obj_or_objs)
|
51
|
+
end
|
52
|
+
end
|
44
53
|
|
45
|
-
|
46
|
-
|
54
|
+
# Formats the sentence (modifies the receiver) and returns its value.
|
55
|
+
# The formatting is essentially to shorten the sentence and ensure that
|
56
|
+
# the index is present somewhere in the sentence. Used for search query
|
57
|
+
# results.
|
58
|
+
#
|
59
|
+
# @param sentence [String] The sentence to be formatted.
|
60
|
+
# @param index [Integer] The first index of a word in sentence. This is
|
61
|
+
# usually a word in a search query.
|
62
|
+
# @param sentence_limit [Integer] The max length of the formatted sentence
|
63
|
+
# being returned. The length will be based on the sentence_limit
|
64
|
+
# parameter or the full length of the original sentence, whichever
|
65
|
+
# is less. The full sentence is returned if the sentence_limit is 0.
|
66
|
+
# @return [String] The sentence once formatted.
|
67
|
+
def self.format_sentence_length(sentence, index, sentence_limit)
|
68
|
+
raise "A sentence value must be provided" if sentence.empty?
|
69
|
+
raise "The sentence length value must be even" if sentence_limit.odd?
|
70
|
+
if index < 0 or index > sentence.length
|
71
|
+
raise "Incorrect index value: #{index}"
|
72
|
+
end
|
73
|
+
|
74
|
+
return sentence if sentence_limit == 0
|
47
75
|
|
48
|
-
|
49
|
-
|
50
|
-
finish = index + (sentence_limit / 2)
|
76
|
+
start = 0
|
77
|
+
finish = sentence.length
|
51
78
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
finish = sentence.length
|
56
|
-
else
|
57
|
-
finish += diff
|
58
|
-
end
|
59
|
-
start = 0
|
60
|
-
elsif finish > sentence.length
|
61
|
-
diff = finish - sentence.length
|
62
|
-
if (start - diff) < 0
|
63
|
-
start = 0
|
64
|
-
else
|
65
|
-
start -= diff
|
66
|
-
end
|
67
|
-
finish = sentence.length
|
68
|
-
end
|
79
|
+
if sentence.length > sentence_limit
|
80
|
+
start = index - (sentence_limit / 2)
|
81
|
+
finish = index + (sentence_limit / 2)
|
69
82
|
|
70
|
-
|
83
|
+
if start < 0
|
84
|
+
diff = 0 - start
|
85
|
+
if (finish + diff) > sentence.length
|
86
|
+
finish = sentence.length
|
87
|
+
else
|
88
|
+
finish += diff
|
71
89
|
end
|
90
|
+
start = 0
|
91
|
+
elsif finish > sentence.length
|
92
|
+
diff = finish - sentence.length
|
93
|
+
if (start - diff) < 0
|
94
|
+
start = 0
|
95
|
+
else
|
96
|
+
start -= diff
|
97
|
+
end
|
98
|
+
finish = sentence.length
|
99
|
+
end
|
72
100
|
|
73
|
-
|
101
|
+
raise if sentence[start..(finish - 1)].length != sentence_limit
|
74
102
|
end
|
75
103
|
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
104
|
+
sentence.replace(sentence[start..(finish - 1)])
|
105
|
+
end
|
106
|
+
|
107
|
+
# Prints out the search results in a search engine like format.
|
108
|
+
# Most of the params are passed to Wgit::Document#search; see the docs.
|
109
|
+
# The format for each result looks like:
|
110
|
+
#
|
111
|
+
# Title
|
112
|
+
#
|
113
|
+
# Keywords (if there are some)
|
114
|
+
#
|
115
|
+
# Text Snippet (showing the searched for query if provided)
|
116
|
+
#
|
117
|
+
# URL
|
118
|
+
#
|
119
|
+
# <empty_line_separator>
|
120
|
+
#
|
121
|
+
# @param results [Array<Wgit::Document>] An Array whose
|
122
|
+
# Wgit::Documents#text matches the query at least once.
|
123
|
+
# @param query [String] The text query to search for.
|
124
|
+
# @param case_sensitive [Boolean] Whether or not the search should be
|
125
|
+
# case sensitive.
|
126
|
+
# @param sentence_length [Integer] The length of the matching text of the
|
127
|
+
# search results to be outputted to the stream.
|
128
|
+
# @param keyword_count [Integer] The max amount of keywords to be
|
129
|
+
# outputted to the stream.
|
130
|
+
# @param stream [#puts] Any object that respond_to? :puts. It is used
|
131
|
+
# to output text somewhere e.g. STDOUT (the default).
|
132
|
+
# @return [nil]
|
133
|
+
def self.printf_search_results(results, query = nil, case_sensitive = false,
|
134
|
+
sentence_length = 80, keyword_count = 5,
|
135
|
+
stream = Kernel)
|
136
|
+
raise "stream must respond_to? :puts" unless stream.respond_to? :puts
|
137
|
+
keyword_count -= 1 # Because Array's are zero indexed.
|
138
|
+
|
139
|
+
results.each do |doc|
|
140
|
+
sentence = if query.nil?
|
141
|
+
nil
|
142
|
+
else
|
143
|
+
sentence = doc.search(query, sentence_length).first
|
144
|
+
if sentence.nil?
|
145
|
+
nil
|
146
|
+
else
|
147
|
+
sentence.strip.empty? ? nil : sentence
|
148
|
+
end
|
149
|
+
end
|
150
|
+
stream.puts doc.title
|
151
|
+
unless doc.keywords.nil? || doc.keywords.empty?
|
152
|
+
stream.puts doc.keywords[0..keyword_count].join(", ")
|
153
|
+
end
|
154
|
+
stream.puts sentence unless sentence.nil?
|
155
|
+
stream.puts doc.url
|
156
|
+
stream.puts
|
113
157
|
end
|
158
|
+
|
159
|
+
nil
|
160
|
+
end
|
114
161
|
end
|
115
162
|
end
|
data/lib/wgit/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wgit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
@@ -15,8 +15,8 @@ description: Wgit is a WWW indexer/scraper which crawls URL's and retrieves thei
|
|
15
15
|
indexed documents stored in a database. Therefore this library provides the main
|
16
16
|
components of a WWW search engine. You can also use Wgit to copy entire website's
|
17
17
|
HTML making it far more powerful than wget. The Wgit API is easily extendable allowing
|
18
|
-
you to easily pull out the parts of a webpage that are important to you, the
|
19
|
-
or
|
18
|
+
you to easily pull out the parts of a webpage that are important to you, the external
|
19
|
+
links or keywords for example.
|
20
20
|
email: michael.telford@live.com
|
21
21
|
executables: []
|
22
22
|
extensions: []
|
@@ -30,14 +30,15 @@ files:
|
|
30
30
|
- "./lib/wgit/database/model.rb"
|
31
31
|
- "./lib/wgit/database/mongo_connection_details.rb"
|
32
32
|
- "./lib/wgit/document.rb"
|
33
|
+
- "./lib/wgit/indexer.rb"
|
33
34
|
- "./lib/wgit/url.rb"
|
34
35
|
- "./lib/wgit/utils.rb"
|
35
36
|
- "./lib/wgit/version.rb"
|
36
|
-
|
37
|
-
homepage: http://rubygems.org/gems/wgit
|
37
|
+
homepage: https://github.com/michaeltelford/wgit
|
38
38
|
licenses:
|
39
39
|
- MIT
|
40
40
|
metadata:
|
41
|
+
source_code_uri: https://github.com/michaeltelford/wgit
|
41
42
|
allowed_push_host: https://rubygems.org
|
42
43
|
post_install_message:
|
43
44
|
rdoc_options: []
|
@@ -45,9 +46,9 @@ require_paths:
|
|
45
46
|
- lib
|
46
47
|
required_ruby_version: !ruby/object:Gem::Requirement
|
47
48
|
requirements:
|
48
|
-
- - "
|
49
|
+
- - "~>"
|
49
50
|
- !ruby/object:Gem::Version
|
50
|
-
version: '
|
51
|
+
version: '2.5'
|
51
52
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
52
53
|
requirements:
|
53
54
|
- - ">="
|
@@ -55,8 +56,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
55
56
|
version: '0'
|
56
57
|
requirements: []
|
57
58
|
rubyforge_project:
|
58
|
-
rubygems_version: 2.
|
59
|
+
rubygems_version: 2.7.8
|
59
60
|
signing_key:
|
60
61
|
specification_version: 4
|
61
|
-
summary: Wgit is wget on steroids with an easy to use API.
|
62
|
+
summary: Wgit is wget on steroids with an easy to use API for web scraping and indexing.
|
62
63
|
test_files: []
|
data/lib/wgit/web_crawler.rb
DELETED
@@ -1,134 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require_relative 'crawler'
|
4
|
-
require_relative 'database/database'
|
5
|
-
|
6
|
-
# @author Michael Telford
|
7
|
-
module Wgit
|
8
|
-
|
9
|
-
# Convenience method to crawl the World Wide Web.
|
10
|
-
# The default value (-1) for max_sites_to_crawl is unrestricted.
|
11
|
-
# The default max_data_size is 1GB.
|
12
|
-
def self.crawl_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
|
13
|
-
db = Wgit::Database.new
|
14
|
-
web_crawler = Wgit::WebCrawler.new(db, max_sites_to_crawl, max_data_size)
|
15
|
-
web_crawler.crawl_the_web
|
16
|
-
end
|
17
|
-
|
18
|
-
# Class which sets up a crawler and saves the indexed
|
19
|
-
# docs to a database. Will crawl the web forever if you let it :-)
|
20
|
-
class WebCrawler
|
21
|
-
attr_accessor :max_sites_to_crawl, :max_data_size
|
22
|
-
attr_reader :crawler, :db
|
23
|
-
|
24
|
-
def initialize(database,
|
25
|
-
max_sites_to_crawl = -1,
|
26
|
-
max_data_size = 1048576000)
|
27
|
-
@crawler = Wgit::Crawler.new
|
28
|
-
@db = database
|
29
|
-
@max_sites_to_crawl = max_sites_to_crawl
|
30
|
-
@max_data_size = max_data_size
|
31
|
-
end
|
32
|
-
|
33
|
-
# Retrieves url's from the database and recursively crawls each site
|
34
|
-
# storing their internal pages into the database and adding their external
|
35
|
-
# url's to be crawled at a later date.
|
36
|
-
def crawl_the_web
|
37
|
-
if max_sites_to_crawl < 0
|
38
|
-
puts "Crawling until the database has been filled or it runs out of \
|
39
|
-
urls to crawl (which might be never)."
|
40
|
-
end
|
41
|
-
loop_count = 0
|
42
|
-
|
43
|
-
while keep_crawling?(loop_count) do
|
44
|
-
puts "Current database size: #{db.size}"
|
45
|
-
crawler.urls = db.uncrawled_urls
|
46
|
-
|
47
|
-
if crawler.urls.empty?
|
48
|
-
puts "No urls to crawl, exiting."
|
49
|
-
break
|
50
|
-
end
|
51
|
-
puts "Starting crawl loop for: #{crawler.urls}"
|
52
|
-
|
53
|
-
docs_count = 0
|
54
|
-
urls_count = 0
|
55
|
-
|
56
|
-
crawler.urls.each do |url|
|
57
|
-
unless keep_crawling?(loop_count)
|
58
|
-
puts "Reached max number of sites to crawl or database \
|
59
|
-
capacity, exiting."
|
60
|
-
return
|
61
|
-
end
|
62
|
-
loop_count += 1
|
63
|
-
|
64
|
-
url.crawled = true
|
65
|
-
raise unless db.update(url) == 1
|
66
|
-
|
67
|
-
site_docs_count = 0
|
68
|
-
ext_links = crawler.crawl_site(url) do |doc|
|
69
|
-
unless doc.empty?
|
70
|
-
if write_doc_to_db(doc)
|
71
|
-
docs_count += 1
|
72
|
-
site_docs_count += 1
|
73
|
-
end
|
74
|
-
end
|
75
|
-
end
|
76
|
-
|
77
|
-
urls_count += write_urls_to_db(ext_links)
|
78
|
-
puts "Crawled and saved #{site_docs_count} docs for the \
|
79
|
-
site: #{url}"
|
80
|
-
end
|
81
|
-
|
82
|
-
puts "Crawled and saved docs for #{docs_count} url(s) overall for \
|
83
|
-
this iteration."
|
84
|
-
puts "Found and saved #{urls_count} external url(s) for the next \
|
85
|
-
iteration."
|
86
|
-
end
|
87
|
-
end
|
88
|
-
|
89
|
-
private
|
90
|
-
|
91
|
-
# Keep crawling or not based on DB size and current loop iteration.
|
92
|
-
def keep_crawling?(loop_count)
|
93
|
-
return false if db.size >= max_data_size
|
94
|
-
# If max_sites_to_crawl is -1 for example then crawl away.
|
95
|
-
if max_sites_to_crawl < 0
|
96
|
-
true
|
97
|
-
else
|
98
|
-
loop_count < max_sites_to_crawl
|
99
|
-
end
|
100
|
-
end
|
101
|
-
|
102
|
-
# The unique url index on the documents collection prevents duplicate
|
103
|
-
# inserts.
|
104
|
-
def write_doc_to_db(doc)
|
105
|
-
db.insert(doc)
|
106
|
-
puts "Saved document for url: #{doc.url}"
|
107
|
-
true
|
108
|
-
rescue Mongo::Error::OperationFailure
|
109
|
-
puts "Document already exists: #{doc.url}"
|
110
|
-
false
|
111
|
-
end
|
112
|
-
|
113
|
-
# The unique url index on the urls collection prevents duplicate inserts.
|
114
|
-
def write_urls_to_db(urls)
|
115
|
-
count = 0
|
116
|
-
if urls.respond_to?(:each)
|
117
|
-
urls.each do |url|
|
118
|
-
begin
|
119
|
-
db.insert(url)
|
120
|
-
count += 1
|
121
|
-
puts "Inserted url: #{url}"
|
122
|
-
rescue Mongo::Error::OperationFailure
|
123
|
-
puts "Url already exists: #{url}"
|
124
|
-
end
|
125
|
-
end
|
126
|
-
end
|
127
|
-
count
|
128
|
-
end
|
129
|
-
end
|
130
|
-
end
|
131
|
-
|
132
|
-
if __FILE__ == $0
|
133
|
-
Wgit.crawl_the_web
|
134
|
-
end
|