wgit 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,115 +1,162 @@
1
-
2
1
  module Wgit
3
2
 
4
- # @author Michael Telford
5
3
  # Utility module containing generic methods.
6
4
  module Utils
7
- def self.time_stamp
8
- Time.new
9
- end
5
+
6
+ # Returns the current time stamp.
7
+ #
8
+ # @return [Time] The current time stamp.
9
+ def self.time_stamp
10
+ Time.new
11
+ end
10
12
 
11
- # Returns a hash created from obj's instance vars and values.
12
- def self.to_h(obj, ignore = [])
13
- hash = {}
14
- obj.instance_variables.each do |var|
15
- next if ignore.include?(var)
16
- hash[var[1..-1].to_sym] = obj.instance_variable_get(var)
17
- end
18
- hash
13
+ # Returns a Hash created from obj's instance vars and values.
14
+ #
15
+ # @param obj [Object] The object to process.
16
+ # @param ignore [Array<String>] Attributes to ignore.
17
+ # @param use_strings_as_keys [Boolean] Whether or not to use strings as
18
+ # the keys in the returned Hash. Symbols are used otherwise.
19
+ # @return [Hash] A Hash created from obj's instance vars and values.
20
+ def self.to_h(obj, ignore = [], use_strings_as_keys = true)
21
+ hash = {}
22
+ obj.instance_variables.each do |var|
23
+ next if ignore.include?(var.to_s)
24
+ key = var.to_s[1..-1]
25
+ key = key.to_sym unless use_strings_as_keys
26
+ hash[key] = obj.instance_variable_get(var)
19
27
  end
20
-
21
- # Improved each method which takes care of singleton and enumerable
22
- # objects. Yields one or more objects.
23
- def self.each(obj_or_objs)
24
- if obj_or_objs.respond_to?(:each)
25
- obj_or_objs.each { |obj| yield obj }
26
- else
27
- yield obj_or_objs
28
- end
28
+ hash
29
+ end
30
+
31
+ # Returns the model having removed non bson types (for use with MongoDB).
32
+ #
33
+ # @param model_hash [Hash] The model Hash to process.
34
+ # @return [Hash] The model Hash with non bson types removed.
35
+ def self.remove_non_bson_types(model_hash)
36
+ model_hash.reject do |k, v|
37
+ not v.respond_to? :bson_type
29
38
  end
39
+ end
30
40
 
31
- # Formats the sentence (modifies the receiver) and returns its value.
32
- # The length will be based on the sentence_limit parameter or the full
33
- # length of the original sentence, which ever is less. The full sentence
34
- # is returned if the sentence_limit is 0. The algorithm obviously ensures
35
- # that the search value is visible somewhere in the sentence.
36
- def self.format_sentence_length(sentence, index, sentence_limit)
37
- raise "A sentence value must be provided" if sentence.empty?
38
- raise "The sentence length value must be even" if sentence_limit.odd?
39
- if index < 0 or index > sentence.length
40
- raise "Incorrect index value: #{index}"
41
- end
42
-
43
- return sentence if sentence_limit == 0
41
+ # An improved :each method which accepts both singleton and Enumerable
42
+ # objects (as opposed to just an Enumerable object).
43
+ #
44
+ # @yield [el] Gives each element of obj_or_objects if it's Enumerable,
45
+ # otherwise obj_or_objs itself is given.
46
+ def self.each(obj_or_objs)
47
+ if obj_or_objs.respond_to?(:each)
48
+ obj_or_objs.each { |obj| yield(obj) }
49
+ else
50
+ yield(obj_or_objs)
51
+ end
52
+ end
44
53
 
45
- start = 0
46
- finish = sentence.length
54
+ # Formats the sentence (modifies the receiver) and returns its value.
55
+ # The formatting is essentially to shorten the sentence and ensure that
56
+ # the index is present somewhere in the sentence. Used for search query
57
+ # results.
58
+ #
59
+ # @param sentence [String] The sentence to be formatted.
60
+ # @param index [Integer] The first index of a word in sentence. This is
61
+ # usually a word in a search query.
62
+ # @param sentence_limit [Integer] The max length of the formatted sentence
63
+ # being returned. The length will be based on the sentence_limit
64
+ # parameter or the full length of the original sentence, whichever
65
+ # is less. The full sentence is returned if the sentence_limit is 0.
66
+ # @return [String] The sentence once formatted.
67
+ def self.format_sentence_length(sentence, index, sentence_limit)
68
+ raise "A sentence value must be provided" if sentence.empty?
69
+ raise "The sentence length value must be even" if sentence_limit.odd?
70
+ if index < 0 or index > sentence.length
71
+ raise "Incorrect index value: #{index}"
72
+ end
73
+
74
+ return sentence if sentence_limit == 0
47
75
 
48
- if sentence.length > sentence_limit
49
- start = index - (sentence_limit / 2)
50
- finish = index + (sentence_limit / 2)
76
+ start = 0
77
+ finish = sentence.length
51
78
 
52
- if start < 0
53
- diff = 0 - start
54
- if (finish + diff) > sentence.length
55
- finish = sentence.length
56
- else
57
- finish += diff
58
- end
59
- start = 0
60
- elsif finish > sentence.length
61
- diff = finish - sentence.length
62
- if (start - diff) < 0
63
- start = 0
64
- else
65
- start -= diff
66
- end
67
- finish = sentence.length
68
- end
79
+ if sentence.length > sentence_limit
80
+ start = index - (sentence_limit / 2)
81
+ finish = index + (sentence_limit / 2)
69
82
 
70
- raise if sentence[start..(finish - 1)].length != sentence_limit
83
+ if start < 0
84
+ diff = 0 - start
85
+ if (finish + diff) > sentence.length
86
+ finish = sentence.length
87
+ else
88
+ finish += diff
71
89
  end
90
+ start = 0
91
+ elsif finish > sentence.length
92
+ diff = finish - sentence.length
93
+ if (start - diff) < 0
94
+ start = 0
95
+ else
96
+ start -= diff
97
+ end
98
+ finish = sentence.length
99
+ end
72
100
 
73
- sentence.replace(sentence[start..(finish - 1)])
101
+ raise if sentence[start..(finish - 1)].length != sentence_limit
74
102
  end
75
103
 
76
- # Prints out the search results in a search engine page format.
77
- # Most of the params are passed to Document#search - see class docs.
78
- # The steam param decides where the printf output is written to, and
79
- # therefore must respond_to? :puts
80
- # The format for each result is:
81
- #
82
- # Title
83
- # Keywords (if there are some)
84
- # Text Snippet (showing the searched for text if provided)
85
- # Url
86
- # <empty_line>
87
- def self.printf_search_results(results, text = nil, case_sensitive = false,
88
- sentence_length = 80, keyword_count = 5,
89
- stream = Kernel)
90
- raise "stream must respond_to? :puts" unless stream.respond_to? :puts
91
- keyword_count -= 1 # Because Array's are zero indexed.
92
-
93
- results.each do |doc|
94
- sentence = if text.nil?
95
- nil
96
- else
97
- sentence = doc.search(text, sentence_length).first
98
- if sentence.nil?
99
- nil
100
- else
101
- sentence.strip.empty? ? nil : sentence
102
- end
103
- end
104
- stream.puts doc.title
105
- unless doc.keywords.empty?
106
- stream.puts doc.keywords[0..keyword_count].join(", ")
107
- end
108
- stream.puts sentence unless sentence.nil?
109
- stream.puts doc.url
110
- stream.puts
111
- end
112
- nil
104
+ sentence.replace(sentence[start..(finish - 1)])
105
+ end
106
+
107
+ # Prints out the search results in a search engine like format.
108
+ # Most of the params are passed to Wgit::Document#search; see the docs.
109
+ # The format for each result looks like:
110
+ #
111
+ # Title
112
+ #
113
+ # Keywords (if there are some)
114
+ #
115
+ # Text Snippet (showing the searched for query if provided)
116
+ #
117
+ # URL
118
+ #
119
+ # <empty_line_separator>
120
+ #
121
+ # @param results [Array<Wgit::Document>] An Array whose
122
+ # Wgit::Documents#text matches the query at least once.
123
+ # @param query [String] The text query to search for.
124
+ # @param case_sensitive [Boolean] Whether or not the search should be
125
+ # case sensitive.
126
+ # @param sentence_length [Integer] The length of the matching text of the
127
+ # search results to be outputted to the stream.
128
+ # @param keyword_count [Integer] The max amount of keywords to be
129
+ # outputted to the stream.
130
+ # @param stream [#puts] Any object that respond_to? :puts. It is used
131
+ # to output text somewhere e.g. STDOUT (the default).
132
+ # @return [nil]
133
+ def self.printf_search_results(results, query = nil, case_sensitive = false,
134
+ sentence_length = 80, keyword_count = 5,
135
+ stream = Kernel)
136
+ raise "stream must respond_to? :puts" unless stream.respond_to? :puts
137
+ keyword_count -= 1 # Because Array's are zero indexed.
138
+
139
+ results.each do |doc|
140
+ sentence = if query.nil?
141
+ nil
142
+ else
143
+ sentence = doc.search(query, sentence_length).first
144
+ if sentence.nil?
145
+ nil
146
+ else
147
+ sentence.strip.empty? ? nil : sentence
148
+ end
149
+ end
150
+ stream.puts doc.title
151
+ unless doc.keywords.nil? || doc.keywords.empty?
152
+ stream.puts doc.keywords[0..keyword_count].join(", ")
153
+ end
154
+ stream.puts sentence unless sentence.nil?
155
+ stream.puts doc.url
156
+ stream.puts
113
157
  end
158
+
159
+ nil
160
+ end
114
161
  end
115
162
  end
@@ -1,3 +1,7 @@
1
+ # Wgit is a WWW indexer/scraper which crawls URL's and retrieves their page
2
+ # contents for later use.
3
+ # @author Michael Telford
1
4
  module Wgit
2
- VERSION = "0.0.1".freeze
5
+ # The current gem version of Wgit.
6
+ VERSION = "0.0.2".freeze
3
7
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wgit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
@@ -15,8 +15,8 @@ description: Wgit is a WWW indexer/scraper which crawls URL's and retrieves thei
15
15
  indexed documents stored in a database. Therefore this library provides the main
16
16
  components of a WWW search engine. You can also use Wgit to copy entire website's
17
17
  HTML making it far more powerful than wget. The Wgit API is easily extendable allowing
18
- you to easily pull out the parts of a webpage that are important to you, the CSS
19
- or JS links for example.
18
+ you to easily pull out the parts of a webpage that are important to you, the external
19
+ links or keywords for example.
20
20
  email: michael.telford@live.com
21
21
  executables: []
22
22
  extensions: []
@@ -30,14 +30,15 @@ files:
30
30
  - "./lib/wgit/database/model.rb"
31
31
  - "./lib/wgit/database/mongo_connection_details.rb"
32
32
  - "./lib/wgit/document.rb"
33
+ - "./lib/wgit/indexer.rb"
33
34
  - "./lib/wgit/url.rb"
34
35
  - "./lib/wgit/utils.rb"
35
36
  - "./lib/wgit/version.rb"
36
- - "./lib/wgit/web_crawler.rb"
37
- homepage: http://rubygems.org/gems/wgit
37
+ homepage: https://github.com/michaeltelford/wgit
38
38
  licenses:
39
39
  - MIT
40
40
  metadata:
41
+ source_code_uri: https://github.com/michaeltelford/wgit
41
42
  allowed_push_host: https://rubygems.org
42
43
  post_install_message:
43
44
  rdoc_options: []
@@ -45,9 +46,9 @@ require_paths:
45
46
  - lib
46
47
  required_ruby_version: !ruby/object:Gem::Requirement
47
48
  requirements:
48
- - - ">="
49
+ - - "~>"
49
50
  - !ruby/object:Gem::Version
50
- version: '0'
51
+ version: '2.5'
51
52
  required_rubygems_version: !ruby/object:Gem::Requirement
52
53
  requirements:
53
54
  - - ">="
@@ -55,8 +56,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
55
56
  version: '0'
56
57
  requirements: []
57
58
  rubyforge_project:
58
- rubygems_version: 2.4.5
59
+ rubygems_version: 2.7.8
59
60
  signing_key:
60
61
  specification_version: 4
61
- summary: Wgit is wget on steroids with an easy to use API.
62
+ summary: Wgit is wget on steroids with an easy to use API for web scraping and indexing.
62
63
  test_files: []
@@ -1,134 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require_relative 'crawler'
4
- require_relative 'database/database'
5
-
6
- # @author Michael Telford
7
- module Wgit
8
-
9
- # Convience method to crawl the World Wide Web.
10
- # The default value (-1) for max_sites_to_crawl is unrestricted.
11
- # The default max_data_size is 1GB.
12
- def self.crawl_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
13
- db = Wgit::Database.new
14
- web_crawler = Wgit::WebCrawler.new(db, max_sites_to_crawl, max_data_size)
15
- web_crawler.crawl_the_web
16
- end
17
-
18
- # Class which sets up a crawler and saves the indexed
19
- # docs to a database. Will crawl the web forever if you let it :-)
20
- class WebCrawler
21
- attr_accessor :max_sites_to_crawl, :max_data_size
22
- attr_reader :crawler, :db
23
-
24
- def initialize(database,
25
- max_sites_to_crawl = -1,
26
- max_data_size = 1048576000)
27
- @crawler = Wgit::Crawler.new
28
- @db = database
29
- @max_sites_to_crawl = max_sites_to_crawl
30
- @max_data_size = max_data_size
31
- end
32
-
33
- # Retrieves url's from the database and recursively crawls each site
34
- # storing their internal pages into the database and adding their external
35
- # url's to be crawled at a later date.
36
- def crawl_the_web
37
- if max_sites_to_crawl < 0
38
- puts "Crawling until the database has been filled or it runs out of \
39
- urls to crawl (which might be never)."
40
- end
41
- loop_count = 0
42
-
43
- while keep_crawling?(loop_count) do
44
- puts "Current database size: #{db.size}"
45
- crawler.urls = db.uncrawled_urls
46
-
47
- if crawler.urls.empty?
48
- puts "No urls to crawl, exiting."
49
- break
50
- end
51
- puts "Starting crawl loop for: #{crawler.urls}"
52
-
53
- docs_count = 0
54
- urls_count = 0
55
-
56
- crawler.urls.each do |url|
57
- unless keep_crawling?(loop_count)
58
- puts "Reached max number of sites to crawl or database \
59
- capacity, exiting."
60
- return
61
- end
62
- loop_count += 1
63
-
64
- url.crawled = true
65
- raise unless db.update(url) == 1
66
-
67
- site_docs_count = 0
68
- ext_links = crawler.crawl_site(url) do |doc|
69
- unless doc.empty?
70
- if write_doc_to_db(doc)
71
- docs_count += 1
72
- site_docs_count += 1
73
- end
74
- end
75
- end
76
-
77
- urls_count += write_urls_to_db(ext_links)
78
- puts "Crawled and saved #{site_docs_count} docs for the \
79
- site: #{url}"
80
- end
81
-
82
- puts "Crawled and saved docs for #{docs_count} url(s) overall for \
83
- this iteration."
84
- puts "Found and saved #{urls_count} external url(s) for the next \
85
- iteration."
86
- end
87
- end
88
-
89
- private
90
-
91
- # Keep crawling or not based on DB size and current loop interation.
92
- def keep_crawling?(loop_count)
93
- return false if db.size >= max_data_size
94
- # If max_sites_to_crawl is -1 for example then crawl away.
95
- if max_sites_to_crawl < 0
96
- true
97
- else
98
- loop_count < max_sites_to_crawl
99
- end
100
- end
101
-
102
- # The unique url index on the documents collection prevents duplicate
103
- # inserts.
104
- def write_doc_to_db(doc)
105
- db.insert(doc)
106
- puts "Saved document for url: #{doc.url}"
107
- true
108
- rescue Mongo::Error::OperationFailure
109
- puts "Document already exists: #{doc.url}"
110
- false
111
- end
112
-
113
- # The unique url index on the urls collection prevents duplicate inserts.
114
- def write_urls_to_db(urls)
115
- count = 0
116
- if urls.respond_to?(:each)
117
- urls.each do |url|
118
- begin
119
- db.insert(url)
120
- count += 1
121
- puts "Inserted url: #{url}"
122
- rescue Mongo::Error::OperationFailure
123
- puts "Url already exists: #{url}"
124
- end
125
- end
126
- end
127
- count
128
- end
129
- end
130
- end
131
-
132
- if __FILE__ == $0
133
- Wgit.crawl_the_web
134
- end