wgit 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/lib/wgit.rb +1 -1
- data/lib/wgit/assertable.rb +72 -61
- data/lib/wgit/core_ext.rb +11 -5
- data/lib/wgit/crawler.rb +97 -57
- data/lib/wgit/database/database.rb +247 -170
- data/lib/wgit/database/model.rb +40 -24
- data/lib/wgit/database/mongo_connection_details.rb +44 -23
- data/lib/wgit/document.rb +534 -233
- data/lib/wgit/indexer.rb +235 -0
- data/lib/wgit/url.rb +199 -121
- data/lib/wgit/utils.rb +143 -96
- data/lib/wgit/version.rb +5 -1
- metadata +10 -9
- data/lib/wgit/web_crawler.rb +0 -134
data/lib/wgit/utils.rb
CHANGED
@@ -1,115 +1,162 @@
|
|
1
|
-
|
2
1
|
module Wgit
|
3
2
|
|
4
|
-
# @author Michael Telford
|
5
3
|
# Utility module containing generic methods.
|
6
4
|
module Utils
|
7
|
-
|
8
|
-
|
9
|
-
|
5
|
+
|
6
|
+
# Returns the current time stamp.
|
7
|
+
#
|
8
|
+
# @return [Time] The current time stamp.
|
9
|
+
def self.time_stamp
|
10
|
+
Time.new
|
11
|
+
end
|
10
12
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
13
|
+
# Returns a Hash created from obj's instance vars and values.
|
14
|
+
#
|
15
|
+
# @param obj [Object] The object to process.
|
16
|
+
# @param ignore [Array<String>] Attributes to ignore.
|
17
|
+
# @param use_strings_as_keys [Boolean] Whether or not to use strings as
|
18
|
+
# the keys in the returned Hash. Symbols are used otherwise.
|
19
|
+
# @return [Hash] A Hash created from obj's instance vars and values.
|
20
|
+
def self.to_h(obj, ignore = [], use_strings_as_keys = true)
|
21
|
+
hash = {}
|
22
|
+
obj.instance_variables.each do |var|
|
23
|
+
next if ignore.include?(var.to_s)
|
24
|
+
key = var.to_s[1..-1]
|
25
|
+
key = key.to_sym unless use_strings_as_keys
|
26
|
+
hash[key] = obj.instance_variable_get(var)
|
19
27
|
end
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
28
|
+
hash
|
29
|
+
end
|
30
|
+
|
31
|
+
# Returns the model having removed non bson types (for use with MongoDB).
|
32
|
+
#
|
33
|
+
# @param model_hash [Hash] The model Hash to process.
|
34
|
+
# @return [Hash] The model Hash with non bson types removed.
|
35
|
+
def self.remove_non_bson_types(model_hash)
|
36
|
+
model_hash.reject do |k, v|
|
37
|
+
not v.respond_to? :bson_type
|
29
38
|
end
|
39
|
+
end
|
30
40
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
return sentence if sentence_limit == 0
|
41
|
+
# An improved :each method which accepts both singleton and Enumerable
|
42
|
+
# objects (as opposed to just an Enumerable object).
|
43
|
+
#
|
44
|
+
# @yield [el] Gives each element of obj_or_objs if it's Enumerable,
|
45
|
+
# otherwise obj_or_objs itself is given.
|
46
|
+
def self.each(obj_or_objs)
|
47
|
+
if obj_or_objs.respond_to?(:each)
|
48
|
+
obj_or_objs.each { |obj| yield(obj) }
|
49
|
+
else
|
50
|
+
yield(obj_or_objs)
|
51
|
+
end
|
52
|
+
end
|
44
53
|
|
45
|
-
|
46
|
-
|
54
|
+
# Formats the sentence (modifies the receiver) and returns its value.
|
55
|
+
# The formatting is essentially to shorten the sentence and ensure that
|
56
|
+
# the index is present somewhere in the sentence. Used for search query
|
57
|
+
# results.
|
58
|
+
#
|
59
|
+
# @param sentence [String] The sentence to be formatted.
|
60
|
+
# @param index [Integer] The first index of a word in sentence. This is
|
61
|
+
# usually a word in a search query.
|
62
|
+
# @param sentence_limit [Integer] The max length of the formatted sentence
|
63
|
+
# being returned. The length will be based on the sentence_limit
|
64
|
+
# parameter or the full length of the original sentence, whichever
|
65
|
+
# is less. The full sentence is returned if the sentence_limit is 0.
|
66
|
+
# @return [String] The sentence once formatted.
|
67
|
+
def self.format_sentence_length(sentence, index, sentence_limit)
|
68
|
+
raise "A sentence value must be provided" if sentence.empty?
|
69
|
+
raise "The sentence length value must be even" if sentence_limit.odd?
|
70
|
+
if index < 0 or index > sentence.length
|
71
|
+
raise "Incorrect index value: #{index}"
|
72
|
+
end
|
73
|
+
|
74
|
+
return sentence if sentence_limit == 0
|
47
75
|
|
48
|
-
|
49
|
-
|
50
|
-
finish = index + (sentence_limit / 2)
|
76
|
+
start = 0
|
77
|
+
finish = sentence.length
|
51
78
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
finish = sentence.length
|
56
|
-
else
|
57
|
-
finish += diff
|
58
|
-
end
|
59
|
-
start = 0
|
60
|
-
elsif finish > sentence.length
|
61
|
-
diff = finish - sentence.length
|
62
|
-
if (start - diff) < 0
|
63
|
-
start = 0
|
64
|
-
else
|
65
|
-
start -= diff
|
66
|
-
end
|
67
|
-
finish = sentence.length
|
68
|
-
end
|
79
|
+
if sentence.length > sentence_limit
|
80
|
+
start = index - (sentence_limit / 2)
|
81
|
+
finish = index + (sentence_limit / 2)
|
69
82
|
|
70
|
-
|
83
|
+
if start < 0
|
84
|
+
diff = 0 - start
|
85
|
+
if (finish + diff) > sentence.length
|
86
|
+
finish = sentence.length
|
87
|
+
else
|
88
|
+
finish += diff
|
71
89
|
end
|
90
|
+
start = 0
|
91
|
+
elsif finish > sentence.length
|
92
|
+
diff = finish - sentence.length
|
93
|
+
if (start - diff) < 0
|
94
|
+
start = 0
|
95
|
+
else
|
96
|
+
start -= diff
|
97
|
+
end
|
98
|
+
finish = sentence.length
|
99
|
+
end
|
72
100
|
|
73
|
-
|
101
|
+
raise if sentence[start..(finish - 1)].length != sentence_limit
|
74
102
|
end
|
75
103
|
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
104
|
+
sentence.replace(sentence[start..(finish - 1)])
|
105
|
+
end
|
106
|
+
|
107
|
+
# Prints out the search results in a search engine like format.
|
108
|
+
# Most of the params are passed to Wgit::Document#search; see the docs.
|
109
|
+
# The format for each result looks like:
|
110
|
+
#
|
111
|
+
# Title
|
112
|
+
#
|
113
|
+
# Keywords (if there are some)
|
114
|
+
#
|
115
|
+
# Text Snippet (showing the searched for query if provided)
|
116
|
+
#
|
117
|
+
# URL
|
118
|
+
#
|
119
|
+
# <empty_line_separator>
|
120
|
+
#
|
121
|
+
# @param results [Array<Wgit::Document>] An Array whose
|
122
|
+
# Wgit::Documents#text matches the query at least once.
|
123
|
+
# @param query [String] The text query to search for.
|
124
|
+
# @param case_sensitive [Boolean] Whether or not the search should be
|
125
|
+
# case sensitive.
|
126
|
+
# @param sentence_length [Integer] The length of the matching text of the
|
127
|
+
# search results to be outputted to the stream.
|
128
|
+
# @param keyword_count [Integer] The max amount of keywords to be
|
129
|
+
# outputted to the stream.
|
130
|
+
# @param stream [#puts] Any object that responds to :puts. It is used
|
131
|
+
# to output text somewhere e.g. STDOUT (the default).
|
132
|
+
# @return [nil]
|
133
|
+
def self.printf_search_results(results, query = nil, case_sensitive = false,
|
134
|
+
sentence_length = 80, keyword_count = 5,
|
135
|
+
stream = Kernel)
|
136
|
+
raise "stream must respond_to? :puts" unless stream.respond_to? :puts
|
137
|
+
keyword_count -= 1 # Because Array's are zero indexed.
|
138
|
+
|
139
|
+
results.each do |doc|
|
140
|
+
sentence = if query.nil?
|
141
|
+
nil
|
142
|
+
else
|
143
|
+
sentence = doc.search(query, sentence_length).first
|
144
|
+
if sentence.nil?
|
145
|
+
nil
|
146
|
+
else
|
147
|
+
sentence.strip.empty? ? nil : sentence
|
148
|
+
end
|
149
|
+
end
|
150
|
+
stream.puts doc.title
|
151
|
+
unless doc.keywords.nil? || doc.keywords.empty?
|
152
|
+
stream.puts doc.keywords[0..keyword_count].join(", ")
|
153
|
+
end
|
154
|
+
stream.puts sentence unless sentence.nil?
|
155
|
+
stream.puts doc.url
|
156
|
+
stream.puts
|
113
157
|
end
|
158
|
+
|
159
|
+
nil
|
160
|
+
end
|
114
161
|
end
|
115
162
|
end
|
data/lib/wgit/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wgit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
@@ -15,8 +15,8 @@ description: Wgit is a WWW indexer/scraper which crawls URL's and retrieves thei
|
|
15
15
|
indexed documents stored in a database. Therefore this library provides the main
|
16
16
|
components of a WWW search engine. You can also use Wgit to copy entire website's
|
17
17
|
HTML making it far more powerful than wget. The Wgit API is easily extendable allowing
|
18
|
-
you to easily pull out the parts of a webpage that are important to you, the
|
19
|
-
or
|
18
|
+
you to easily pull out the parts of a webpage that are important to you, the external
|
19
|
+
links or keywords for example.
|
20
20
|
email: michael.telford@live.com
|
21
21
|
executables: []
|
22
22
|
extensions: []
|
@@ -30,14 +30,15 @@ files:
|
|
30
30
|
- "./lib/wgit/database/model.rb"
|
31
31
|
- "./lib/wgit/database/mongo_connection_details.rb"
|
32
32
|
- "./lib/wgit/document.rb"
|
33
|
+
- "./lib/wgit/indexer.rb"
|
33
34
|
- "./lib/wgit/url.rb"
|
34
35
|
- "./lib/wgit/utils.rb"
|
35
36
|
- "./lib/wgit/version.rb"
|
36
|
-
|
37
|
-
homepage: http://rubygems.org/gems/wgit
|
37
|
+
homepage: https://github.com/michaeltelford/wgit
|
38
38
|
licenses:
|
39
39
|
- MIT
|
40
40
|
metadata:
|
41
|
+
source_code_uri: https://github.com/michaeltelford/wgit
|
41
42
|
allowed_push_host: https://rubygems.org
|
42
43
|
post_install_message:
|
43
44
|
rdoc_options: []
|
@@ -45,9 +46,9 @@ require_paths:
|
|
45
46
|
- lib
|
46
47
|
required_ruby_version: !ruby/object:Gem::Requirement
|
47
48
|
requirements:
|
48
|
-
- - "
|
49
|
+
- - "~>"
|
49
50
|
- !ruby/object:Gem::Version
|
50
|
-
version: '
|
51
|
+
version: '2.5'
|
51
52
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
52
53
|
requirements:
|
53
54
|
- - ">="
|
@@ -55,8 +56,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
55
56
|
version: '0'
|
56
57
|
requirements: []
|
57
58
|
rubyforge_project:
|
58
|
-
rubygems_version: 2.
|
59
|
+
rubygems_version: 2.7.8
|
59
60
|
signing_key:
|
60
61
|
specification_version: 4
|
61
|
-
summary: Wgit is wget on steroids with an easy to use API.
|
62
|
+
summary: Wgit is wget on steroids with an easy to use API for web scraping and indexing.
|
62
63
|
test_files: []
|
data/lib/wgit/web_crawler.rb
DELETED
@@ -1,134 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require_relative 'crawler'
|
4
|
-
require_relative 'database/database'
|
5
|
-
|
6
|
-
# @author Michael Telford
|
7
|
-
module Wgit
|
8
|
-
|
9
|
-
# Convenience method to crawl the World Wide Web.
|
10
|
-
# The default value (-1) for max_sites_to_crawl is unrestricted.
|
11
|
-
# The default max_data_size is 1GB.
|
12
|
-
def self.crawl_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
|
13
|
-
db = Wgit::Database.new
|
14
|
-
web_crawler = Wgit::WebCrawler.new(db, max_sites_to_crawl, max_data_size)
|
15
|
-
web_crawler.crawl_the_web
|
16
|
-
end
|
17
|
-
|
18
|
-
# Class which sets up a crawler and saves the indexed
|
19
|
-
# docs to a database. Will crawl the web forever if you let it :-)
|
20
|
-
class WebCrawler
|
21
|
-
attr_accessor :max_sites_to_crawl, :max_data_size
|
22
|
-
attr_reader :crawler, :db
|
23
|
-
|
24
|
-
def initialize(database,
|
25
|
-
max_sites_to_crawl = -1,
|
26
|
-
max_data_size = 1048576000)
|
27
|
-
@crawler = Wgit::Crawler.new
|
28
|
-
@db = database
|
29
|
-
@max_sites_to_crawl = max_sites_to_crawl
|
30
|
-
@max_data_size = max_data_size
|
31
|
-
end
|
32
|
-
|
33
|
-
# Retrieves url's from the database and recursively crawls each site
|
34
|
-
# storing their internal pages into the database and adding their external
|
35
|
-
# url's to be crawled at a later date.
|
36
|
-
def crawl_the_web
|
37
|
-
if max_sites_to_crawl < 0
|
38
|
-
puts "Crawling until the database has been filled or it runs out of \
|
39
|
-
urls to crawl (which might be never)."
|
40
|
-
end
|
41
|
-
loop_count = 0
|
42
|
-
|
43
|
-
while keep_crawling?(loop_count) do
|
44
|
-
puts "Current database size: #{db.size}"
|
45
|
-
crawler.urls = db.uncrawled_urls
|
46
|
-
|
47
|
-
if crawler.urls.empty?
|
48
|
-
puts "No urls to crawl, exiting."
|
49
|
-
break
|
50
|
-
end
|
51
|
-
puts "Starting crawl loop for: #{crawler.urls}"
|
52
|
-
|
53
|
-
docs_count = 0
|
54
|
-
urls_count = 0
|
55
|
-
|
56
|
-
crawler.urls.each do |url|
|
57
|
-
unless keep_crawling?(loop_count)
|
58
|
-
puts "Reached max number of sites to crawl or database \
|
59
|
-
capacity, exiting."
|
60
|
-
return
|
61
|
-
end
|
62
|
-
loop_count += 1
|
63
|
-
|
64
|
-
url.crawled = true
|
65
|
-
raise unless db.update(url) == 1
|
66
|
-
|
67
|
-
site_docs_count = 0
|
68
|
-
ext_links = crawler.crawl_site(url) do |doc|
|
69
|
-
unless doc.empty?
|
70
|
-
if write_doc_to_db(doc)
|
71
|
-
docs_count += 1
|
72
|
-
site_docs_count += 1
|
73
|
-
end
|
74
|
-
end
|
75
|
-
end
|
76
|
-
|
77
|
-
urls_count += write_urls_to_db(ext_links)
|
78
|
-
puts "Crawled and saved #{site_docs_count} docs for the \
|
79
|
-
site: #{url}"
|
80
|
-
end
|
81
|
-
|
82
|
-
puts "Crawled and saved docs for #{docs_count} url(s) overall for \
|
83
|
-
this iteration."
|
84
|
-
puts "Found and saved #{urls_count} external url(s) for the next \
|
85
|
-
iteration."
|
86
|
-
end
|
87
|
-
end
|
88
|
-
|
89
|
-
private
|
90
|
-
|
91
|
-
# Keep crawling or not based on DB size and current loop iteration.
|
92
|
-
def keep_crawling?(loop_count)
|
93
|
-
return false if db.size >= max_data_size
|
94
|
-
# If max_sites_to_crawl is -1 for example then crawl away.
|
95
|
-
if max_sites_to_crawl < 0
|
96
|
-
true
|
97
|
-
else
|
98
|
-
loop_count < max_sites_to_crawl
|
99
|
-
end
|
100
|
-
end
|
101
|
-
|
102
|
-
# The unique url index on the documents collection prevents duplicate
|
103
|
-
# inserts.
|
104
|
-
def write_doc_to_db(doc)
|
105
|
-
db.insert(doc)
|
106
|
-
puts "Saved document for url: #{doc.url}"
|
107
|
-
true
|
108
|
-
rescue Mongo::Error::OperationFailure
|
109
|
-
puts "Document already exists: #{doc.url}"
|
110
|
-
false
|
111
|
-
end
|
112
|
-
|
113
|
-
# The unique url index on the urls collection prevents duplicate inserts.
|
114
|
-
def write_urls_to_db(urls)
|
115
|
-
count = 0
|
116
|
-
if urls.respond_to?(:each)
|
117
|
-
urls.each do |url|
|
118
|
-
begin
|
119
|
-
db.insert(url)
|
120
|
-
count += 1
|
121
|
-
puts "Inserted url: #{url}"
|
122
|
-
rescue Mongo::Error::OperationFailure
|
123
|
-
puts "Url already exists: #{url}"
|
124
|
-
end
|
125
|
-
end
|
126
|
-
end
|
127
|
-
count
|
128
|
-
end
|
129
|
-
end
|
130
|
-
end
|
131
|
-
|
132
|
-
if __FILE__ == $0
|
133
|
-
Wgit.crawl_the_web
|
134
|
-
end
|