wgit 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,115 +1,162 @@
1
-
2
1
  module Wgit
3
2
 
4
- # @author Michael Telford
5
3
  # Utility module containing generic methods.
6
4
  module Utils
7
- def self.time_stamp
8
- Time.new
9
- end
5
+
6
+ # Returns the current time stamp.
7
+ #
8
+ # @return [Time] The current time stamp.
9
+ def self.time_stamp
10
+ Time.new
11
+ end
10
12
 
11
- # Returns a hash created from obj's instance vars and values.
12
- def self.to_h(obj, ignore = [])
13
- hash = {}
14
- obj.instance_variables.each do |var|
15
- next if ignore.include?(var)
16
- hash[var[1..-1].to_sym] = obj.instance_variable_get(var)
17
- end
18
- hash
13
+ # Returns a Hash created from obj's instance vars and values.
14
+ #
15
+ # @param obj [Object] The object to process.
16
+ # @param ignore [Array<String>] Instance var names (with leading "@") to ignore.
17
+ # @param use_strings_as_keys [Boolean] Whether or not to use strings as
18
+ # the keys in the returned Hash. Symbols are used otherwise.
19
+ # @return [Hash] A Hash created from obj's instance vars and values.
20
+ def self.to_h(obj, ignore = [], use_strings_as_keys = true)
21
+ hash = {}
22
+ obj.instance_variables.each do |var|
23
+ next if ignore.include?(var.to_s)
24
+ key = var.to_s[1..-1]
25
+ key = key.to_sym unless use_strings_as_keys
26
+ hash[key] = obj.instance_variable_get(var)
19
27
  end
20
-
21
- # Improved each method which takes care of singleton and enumerable
22
- # objects. Yields one or more objects.
23
- def self.each(obj_or_objs)
24
- if obj_or_objs.respond_to?(:each)
25
- obj_or_objs.each { |obj| yield obj }
26
- else
27
- yield obj_or_objs
28
- end
28
+ hash
29
+ end
30
+
31
+ # Returns the model having removed non bson types (for use with MongoDB).
32
+ #
33
+ # @param model_hash [Hash] The model Hash to process.
34
+ # @return [Hash] The model Hash with non bson types removed.
35
+ def self.remove_non_bson_types(model_hash)
36
+ model_hash.reject do |k, v|
37
+ not v.respond_to? :bson_type
29
38
  end
39
+ end
30
40
 
31
- # Formats the sentence (modifies the receiver) and returns its value.
32
- # The length will be based on the sentence_limit parameter or the full
33
- # length of the original sentence, which ever is less. The full sentence
34
- # is returned if the sentence_limit is 0. The algorithm obviously ensures
35
- # that the search value is visible somewhere in the sentence.
36
- def self.format_sentence_length(sentence, index, sentence_limit)
37
- raise "A sentence value must be provided" if sentence.empty?
38
- raise "The sentence length value must be even" if sentence_limit.odd?
39
- if index < 0 or index > sentence.length
40
- raise "Incorrect index value: #{index}"
41
- end
42
-
43
- return sentence if sentence_limit == 0
41
+ # An improved :each method which accepts both singleton and Enumerable
42
+ # objects (as opposed to just an Enumerable object).
43
+ #
44
+ # @yield [el] Gives each element of obj_or_objs if it's Enumerable,
45
+ # otherwise obj_or_objs itself is given.
46
+ def self.each(obj_or_objs)
47
+ if obj_or_objs.respond_to?(:each)
48
+ obj_or_objs.each { |obj| yield(obj) }
49
+ else
50
+ yield(obj_or_objs)
51
+ end
52
+ end
44
53
 
45
- start = 0
46
- finish = sentence.length
54
+ # Formats the sentence (modifies the receiver) and returns its value.
55
+ # The formatting is essentially to shorten the sentence and ensure that
56
+ # the index is present somewhere in the sentence. Used for search query
57
+ # results.
58
+ #
59
+ # @param sentence [String] The sentence to be formatted.
60
+ # @param index [Integer] The first index of a word in sentence. This is
61
+ # usually a word in a search query.
62
+ # @param sentence_limit [Integer] The max length of the formatted sentence
63
+ # being returned. The length will be based on the sentence_limit
64
+ # parameter or the full length of the original sentence, whichever
65
+ # is less. The full sentence is returned if the sentence_limit is 0.
66
+ # @return [String] The sentence once formatted.
67
+ def self.format_sentence_length(sentence, index, sentence_limit)
68
+ raise "A sentence value must be provided" if sentence.empty?
69
+ raise "The sentence length value must be even" if sentence_limit.odd?
70
+ if index < 0 or index > sentence.length
71
+ raise "Incorrect index value: #{index}"
72
+ end
73
+
74
+ return sentence if sentence_limit == 0
47
75
 
48
- if sentence.length > sentence_limit
49
- start = index - (sentence_limit / 2)
50
- finish = index + (sentence_limit / 2)
76
+ start = 0
77
+ finish = sentence.length
51
78
 
52
- if start < 0
53
- diff = 0 - start
54
- if (finish + diff) > sentence.length
55
- finish = sentence.length
56
- else
57
- finish += diff
58
- end
59
- start = 0
60
- elsif finish > sentence.length
61
- diff = finish - sentence.length
62
- if (start - diff) < 0
63
- start = 0
64
- else
65
- start -= diff
66
- end
67
- finish = sentence.length
68
- end
79
+ if sentence.length > sentence_limit
80
+ start = index - (sentence_limit / 2)
81
+ finish = index + (sentence_limit / 2)
69
82
 
70
- raise if sentence[start..(finish - 1)].length != sentence_limit
83
+ if start < 0
84
+ diff = 0 - start
85
+ if (finish + diff) > sentence.length
86
+ finish = sentence.length
87
+ else
88
+ finish += diff
71
89
  end
90
+ start = 0
91
+ elsif finish > sentence.length
92
+ diff = finish - sentence.length
93
+ if (start - diff) < 0
94
+ start = 0
95
+ else
96
+ start -= diff
97
+ end
98
+ finish = sentence.length
99
+ end
72
100
 
73
- sentence.replace(sentence[start..(finish - 1)])
101
+ raise if sentence[start..(finish - 1)].length != sentence_limit
74
102
  end
75
103
 
76
- # Prints out the search results in a search engine page format.
77
- # Most of the params are passed to Document#search - see class docs.
78
- # The steam param decides where the printf output is written to, and
79
- # therefore must respond_to? :puts
80
- # The format for each result is:
81
- #
82
- # Title
83
- # Keywords (if there are some)
84
- # Text Snippet (showing the searched for text if provided)
85
- # Url
86
- # <empty_line>
87
- def self.printf_search_results(results, text = nil, case_sensitive = false,
88
- sentence_length = 80, keyword_count = 5,
89
- stream = Kernel)
90
- raise "stream must respond_to? :puts" unless stream.respond_to? :puts
91
- keyword_count -= 1 # Because Array's are zero indexed.
92
-
93
- results.each do |doc|
94
- sentence = if text.nil?
95
- nil
96
- else
97
- sentence = doc.search(text, sentence_length).first
98
- if sentence.nil?
99
- nil
100
- else
101
- sentence.strip.empty? ? nil : sentence
102
- end
103
- end
104
- stream.puts doc.title
105
- unless doc.keywords.empty?
106
- stream.puts doc.keywords[0..keyword_count].join(", ")
107
- end
108
- stream.puts sentence unless sentence.nil?
109
- stream.puts doc.url
110
- stream.puts
111
- end
112
- nil
104
+ sentence.replace(sentence[start..(finish - 1)])
105
+ end
106
+
107
+ # Prints out the search results in a search engine like format.
108
+ # Most of the params are passed to Wgit::Document#search; see the docs.
109
+ # The format for each result looks like:
110
+ #
111
+ # Title
112
+ #
113
+ # Keywords (if there are some)
114
+ #
115
+ # Text Snippet (showing the searched for query if provided)
116
+ #
117
+ # URL
118
+ #
119
+ # <empty_line_separator>
120
+ #
121
+ # @param results [Array<Wgit::Document>] An Array of Wgit::Documents
122
+ # whose #text matches the query at least once.
123
+ # @param query [String] The text query to search for.
124
+ # @param case_sensitive [Boolean] Whether or not the search should be
125
+ # case sensitive.
126
+ # @param sentence_length [Integer] The length of the matching text of the
127
+ # search results to be outputted to the stream.
128
+ # @param keyword_count [Integer] The max amount of keywords to be
129
+ # outputted to the stream.
130
+ # @param stream [#puts] Any object that responds to :puts. It is used
131
+ # to output text somewhere e.g. STDOUT (the default).
132
+ # @return [nil]
133
+ def self.printf_search_results(results, query = nil, case_sensitive = false,
134
+ sentence_length = 80, keyword_count = 5,
135
+ stream = Kernel)
136
+ raise "stream must respond_to? :puts" unless stream.respond_to? :puts
137
+ keyword_count -= 1 # Because Array's are zero indexed.
138
+
139
+ results.each do |doc|
140
+ sentence = if query.nil?
141
+ nil
142
+ else
143
+ sentence = doc.search(query, sentence_length).first
144
+ if sentence.nil?
145
+ nil
146
+ else
147
+ sentence.strip.empty? ? nil : sentence
148
+ end
149
+ end
150
+ stream.puts doc.title
151
+ unless doc.keywords.nil? || doc.keywords.empty?
152
+ stream.puts doc.keywords[0..keyword_count].join(", ")
153
+ end
154
+ stream.puts sentence unless sentence.nil?
155
+ stream.puts doc.url
156
+ stream.puts
113
157
  end
158
+
159
+ nil
160
+ end
114
161
  end
115
162
  end
@@ -1,3 +1,7 @@
1
+ # Wgit is a WWW indexer/scraper which crawls URL's and retrieves their page
2
+ # contents for later use.
3
+ # @author Michael Telford
1
4
  module Wgit
2
- VERSION = "0.0.1".freeze
5
+ # The current gem version of Wgit.
6
+ VERSION = "0.0.2".freeze
3
7
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wgit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
@@ -15,8 +15,8 @@ description: Wgit is a WWW indexer/scraper which crawls URL's and retrieves thei
15
15
  indexed documents stored in a database. Therefore this library provides the main
16
16
  components of a WWW search engine. You can also use Wgit to copy entire website's
17
17
  HTML making it far more powerful than wget. The Wgit API is easily extendable allowing
18
- you to easily pull out the parts of a webpage that are important to you, the CSS
19
- or JS links for example.
18
+ you to easily pull out the parts of a webpage that are important to you, the external
19
+ links or keywords for example.
20
20
  email: michael.telford@live.com
21
21
  executables: []
22
22
  extensions: []
@@ -30,14 +30,15 @@ files:
30
30
  - "./lib/wgit/database/model.rb"
31
31
  - "./lib/wgit/database/mongo_connection_details.rb"
32
32
  - "./lib/wgit/document.rb"
33
+ - "./lib/wgit/indexer.rb"
33
34
  - "./lib/wgit/url.rb"
34
35
  - "./lib/wgit/utils.rb"
35
36
  - "./lib/wgit/version.rb"
36
- - "./lib/wgit/web_crawler.rb"
37
- homepage: http://rubygems.org/gems/wgit
37
+ homepage: https://github.com/michaeltelford/wgit
38
38
  licenses:
39
39
  - MIT
40
40
  metadata:
41
+ source_code_uri: https://github.com/michaeltelford/wgit
41
42
  allowed_push_host: https://rubygems.org
42
43
  post_install_message:
43
44
  rdoc_options: []
@@ -45,9 +46,9 @@ require_paths:
45
46
  - lib
46
47
  required_ruby_version: !ruby/object:Gem::Requirement
47
48
  requirements:
48
- - - ">="
49
+ - - "~>"
49
50
  - !ruby/object:Gem::Version
50
- version: '0'
51
+ version: '2.5'
51
52
  required_rubygems_version: !ruby/object:Gem::Requirement
52
53
  requirements:
53
54
  - - ">="
@@ -55,8 +56,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
55
56
  version: '0'
56
57
  requirements: []
57
58
  rubyforge_project:
58
- rubygems_version: 2.4.5
59
+ rubygems_version: 2.7.8
59
60
  signing_key:
60
61
  specification_version: 4
61
- summary: Wgit is wget on steroids with an easy to use API.
62
+ summary: Wgit is wget on steroids with an easy to use API for web scraping and indexing.
62
63
  test_files: []
@@ -1,134 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require_relative 'crawler'
4
- require_relative 'database/database'
5
-
6
- # @author Michael Telford
7
- module Wgit
8
-
9
- # Convience method to crawl the World Wide Web.
10
- # The default value (-1) for max_sites_to_crawl is unrestricted.
11
- # The default max_data_size is 1GB.
12
- def self.crawl_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
13
- db = Wgit::Database.new
14
- web_crawler = Wgit::WebCrawler.new(db, max_sites_to_crawl, max_data_size)
15
- web_crawler.crawl_the_web
16
- end
17
-
18
- # Class which sets up a crawler and saves the indexed
19
- # docs to a database. Will crawl the web forever if you let it :-)
20
- class WebCrawler
21
- attr_accessor :max_sites_to_crawl, :max_data_size
22
- attr_reader :crawler, :db
23
-
24
- def initialize(database,
25
- max_sites_to_crawl = -1,
26
- max_data_size = 1048576000)
27
- @crawler = Wgit::Crawler.new
28
- @db = database
29
- @max_sites_to_crawl = max_sites_to_crawl
30
- @max_data_size = max_data_size
31
- end
32
-
33
- # Retrieves url's from the database and recursively crawls each site
34
- # storing their internal pages into the database and adding their external
35
- # url's to be crawled at a later date.
36
- def crawl_the_web
37
- if max_sites_to_crawl < 0
38
- puts "Crawling until the database has been filled or it runs out of \
39
- urls to crawl (which might be never)."
40
- end
41
- loop_count = 0
42
-
43
- while keep_crawling?(loop_count) do
44
- puts "Current database size: #{db.size}"
45
- crawler.urls = db.uncrawled_urls
46
-
47
- if crawler.urls.empty?
48
- puts "No urls to crawl, exiting."
49
- break
50
- end
51
- puts "Starting crawl loop for: #{crawler.urls}"
52
-
53
- docs_count = 0
54
- urls_count = 0
55
-
56
- crawler.urls.each do |url|
57
- unless keep_crawling?(loop_count)
58
- puts "Reached max number of sites to crawl or database \
59
- capacity, exiting."
60
- return
61
- end
62
- loop_count += 1
63
-
64
- url.crawled = true
65
- raise unless db.update(url) == 1
66
-
67
- site_docs_count = 0
68
- ext_links = crawler.crawl_site(url) do |doc|
69
- unless doc.empty?
70
- if write_doc_to_db(doc)
71
- docs_count += 1
72
- site_docs_count += 1
73
- end
74
- end
75
- end
76
-
77
- urls_count += write_urls_to_db(ext_links)
78
- puts "Crawled and saved #{site_docs_count} docs for the \
79
- site: #{url}"
80
- end
81
-
82
- puts "Crawled and saved docs for #{docs_count} url(s) overall for \
83
- this iteration."
84
- puts "Found and saved #{urls_count} external url(s) for the next \
85
- iteration."
86
- end
87
- end
88
-
89
- private
90
-
91
- # Keep crawling or not based on DB size and current loop interation.
92
- def keep_crawling?(loop_count)
93
- return false if db.size >= max_data_size
94
- # If max_sites_to_crawl is -1 for example then crawl away.
95
- if max_sites_to_crawl < 0
96
- true
97
- else
98
- loop_count < max_sites_to_crawl
99
- end
100
- end
101
-
102
- # The unique url index on the documents collection prevents duplicate
103
- # inserts.
104
- def write_doc_to_db(doc)
105
- db.insert(doc)
106
- puts "Saved document for url: #{doc.url}"
107
- true
108
- rescue Mongo::Error::OperationFailure
109
- puts "Document already exists: #{doc.url}"
110
- false
111
- end
112
-
113
- # The unique url index on the urls collection prevents duplicate inserts.
114
- def write_urls_to_db(urls)
115
- count = 0
116
- if urls.respond_to?(:each)
117
- urls.each do |url|
118
- begin
119
- db.insert(url)
120
- count += 1
121
- puts "Inserted url: #{url}"
122
- rescue Mongo::Error::OperationFailure
123
- puts "Url already exists: #{url}"
124
- end
125
- end
126
- end
127
- count
128
- end
129
- end
130
- end
131
-
132
- if __FILE__ == $0
133
- Wgit.crawl_the_web
134
- end