wgit 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 6956381fcc74e20521f0e219cbfaaa74da79de5bdb24349c2fdf4643ca384a31
- data.tar.gz: a544446aa9333d2001119df37ca929cdf2585f89ed084071e077c460b4ff24c9
+ metadata.gz: d9d045d2dd7f570db1811bafab1ac244103cc359033efd9279323c795a67bb9f
+ data.tar.gz: 996801763a6576ede812e2edd7d201ceb34b2135548a365b748f953e7df40db9
  SHA512:
- metadata.gz: 517665017a25419d9213df10347cd704a98ee0061243ebcd8d482465461a16d5b8319971321703b663ec8d6ef8f453d60d771d2122590b1655a6fc08be461026
- data.tar.gz: 760e1c8b1b5cf385dfb1d0418c3b416cdef7a9e02595b1f729a30179848145cdc3c4fa25e2bacf073779baba9909b20ef9f2c5038c8b9df1437f0ade81e05990
+ metadata.gz: e0dfe907c599c320377464aec927b24700d0e9e17d7bb37b4903715af63cbf031dc5983cd6749b1d90353cbcffc0d71e76eb2a0f8c0ba77b3b03f2d51ca9634f
+ data.tar.gz: bade693ab5b32bf8a16747233356307fe489798855133b8e16b3a907d38f8fd9ecfadadab0273d1c0767106bcf85027c64ebb6f86dc38660240200d5fef07377
data/lib/wgit/crawler.rb CHANGED
@@ -5,6 +5,7 @@ require_relative 'document'
  require_relative 'utils'
  require_relative 'assertable'
  require 'net/http' # Requires 'uri'.
+ require 'benchmark'

  module Wgit
  # The Crawler class provides a means of crawling web based HTTP Wgit::Url's,
@@ -105,7 +106,7 @@ module Wgit
  # Crawl the url returning the response Wgit::Document or nil if an error
  # occurs.
  #
- # @param url [Wgit::Url] The Url to crawl.
+ # @param url [Wgit::Url] The Url to crawl; which will likely be modified.
  # @param follow_external_redirects [Boolean] Whether or not to follow
  # an external redirect. External meaning to a different host. False will
  # return nil for such a crawl. If false, you must also provide a `host:`
@@ -123,16 +124,14 @@ module Wgit
  # A String url isn't allowed because it's passed by value not reference,
  # meaning a redirect isn't reflected; A Wgit::Url is passed by reference.
  assert_type(url, Wgit::Url)
+ raise 'host cannot be nil if follow_external_redirects is false' \
  if !follow_external_redirects && host.nil?
- raise 'host cannot be nil if follow_external_redirects is false'
- end

  html = fetch(
  url,
  follow_external_redirects: follow_external_redirects,
  host: host
  )
- url.crawled = true

  doc = Wgit::Document.new(url, html)
  yield(doc) if block_given?
@@ -147,7 +146,8 @@ module Wgit
  # HTTP response that doesn't return a HTML body will be ignored and nil
  # will be returned; otherwise, the HTML String is returned.
  #
- # @param url [Wgit::Url] The URL to fetch the HTML for.
+ # @param url [Wgit::Url] The URL to fetch the HTML for. This Url object
+ # will likely be modified as a result of the fetch/crawl.
  # @param follow_external_redirects [Boolean] Whether or not to follow
  # an external redirect. False will return nil for such a crawl. If false,
  # you must also provide a `host:` parameter.
@@ -159,19 +159,26 @@ module Wgit
  # @return [String, nil] The crawled HTML or nil if the crawl was
  # unsuccessful.
  def fetch(url, follow_external_redirects: true, host: nil)
- response = resolve(
- url,
- follow_external_redirects: follow_external_redirects,
- host: host
- )
- @last_response = response
+ crawl_duration = nil
+ response = nil
+
+ crawl_duration = Benchmark.measure do
+ response = resolve(
+ url,
+ follow_external_redirects: follow_external_redirects,
+ host: host
+ )
+ end.real

  response.body.empty? ? nil : response.body
  rescue StandardError => e
  Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e.message}")
- @last_response = nil

  nil
+ ensure
+ url.crawled = true # Also sets date_crawled underneath.
+ url.crawl_duration = crawl_duration
+ @last_response = response
  end

  # The resolve method performs a HTTP GET to obtain the HTML response. The
@@ -204,10 +211,9 @@ module Wgit

  yield(url, response, location) if block_given?

+ raise "External redirect not allowed - Redirected to: \
+ '#{location}', which is outside of host: '#{host}'" \
  if !follow_external_redirects && !location.is_relative?(host: host)
- raise "External redirect not allowed - Redirected to: \
- '#{location}', which is outside of host: '#{host}'"
- end

  raise "Too many redirects: #{redirect_count}" \
  if redirect_count >= @redirect_limit
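
Taken together, the crawler changes wrap the resolve call in Benchmark.measure and record the crawl state on the Url from fetch's ensure block, so crawled, date_crawled and crawl_duration are set even when the request raises. A minimal usage sketch; the URL is illustrative and `crawl_url` is assumed to be the public crawl method (its `def` line isn't part of this diff):

    require 'wgit'

    crawler = Wgit::Crawler.new
    url     = Wgit::Url.new('https://example.com') # Illustrative URL.

    # Assumed public entry point; it calls fetch, whose ensure block records
    # the crawl state even if an error occurred.
    doc = crawler.crawl_url(url)

    url.crawled         # => true
    url.date_crawled    # => Time stamp, set by the custom crawled= writer.
    url.crawl_duration  # => Float seconds, from Benchmark.measure { ... }.real.
    doc&.crawl_duration # => Same value, delegated to the Url by the Document.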
@@ -235,7 +241,7 @@ module Wgit
  .uniq
  .reject do |link|
  ext = link.to_extension
- ext ? !%w[htm html].include?(ext) : false
+ ext ? !%w[htm html].include?(ext.downcase) : false
  end
  end

@@ -64,7 +64,9 @@ module Wgit
  # Wgit::Model.document.
  # @raise [StandardError] If data isn't valid.
  def insert(data)
+ data = data.dup # Avoid modifying by reference.
  type = data.is_a?(Enumerable) ? data.first : data
+
  case type
  when Wgit::Url
  insert_urls(data)
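
Both insert and update (below) now dup their argument first; per the inline comment, this avoids modifying the caller's object by reference. A hedged sketch, assuming a reachable database behind the connection string:

    require 'wgit'

    db  = Wgit::Database.new(ENV['WGIT_CONNECTION_STRING'])
    url = Wgit::Url.new('https://example.com', crawled: true) # Illustrative URL.

    db.insert(url) # Operates on a dup, so the url object passed in is left untouched.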
@@ -146,7 +148,7 @@ module Wgit
  def search(
  query, case_sensitive: false, whole_sentence: false, limit: 10, skip: 0
  )
- query.strip!
+ query = query.to_s.strip
  query.replace('"' + query + '"') if whole_sentence

  # Sort based on the most search hits (aka "textScore").
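
search now normalises a copy of the query with `query.to_s.strip` instead of mutating the caller's string via `strip!`, so frozen string literals (and non-String queries) are handled safely. A hedged usage sketch, again assuming a reachable database:

    require 'wgit'

    db = Wgit::Database.new(ENV['WGIT_CONNECTION_STRING'])

    # A frozen literal is fine now that search no longer calls strip! on it.
    results = db.search('  ruby web crawler  ', whole_sentence: true, limit: 5)
    results.each { |doc| puts doc.title }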
@@ -232,6 +234,8 @@ module Wgit
  # @param data [Wgit::Url, Wgit::Document] The data to update.
  # @raise [StandardError] If the data is not valid.
  def update(data)
+ data = data.dup # Avoid modifying by reference.
+
  case data
  when Wgit::Url
  update_url(data)
@@ -3,43 +3,47 @@
  require_relative '../utils'

  module Wgit
- # Module used to build the database collection objects.
+ # Module used to build the database collection objects, forming a data model.
  module Model
- # The data model for a Wgit::Url.
+ # The data model for a Wgit::Url collection object and for an embedded
+ # 'url' inside a Wgit::Document collection object.
  #
- # @param url [Wgit::Url] The Url DB record.
+ # @param url [Wgit::Url] The Url data object.
  # @return [Hash] The URL model ready for DB insertion.
  def self.url(url)
  raise 'url must respond_to? :to_h' unless url.respond_to?(:to_h)

  model = url.to_h
+
  Wgit::Utils.remove_non_bson_types(model)
  end

- # The data model for a Wgit::Document.
+ # The data model for a Wgit::Document collection object.
  #
- # @param doc [Wgit::Document] The Document DB record.
+ # @param doc [Wgit::Document] The Document data object.
  # @return [Hash] The Document model ready for DB insertion.
  def self.document(doc)
  raise 'doc must respond_to? :to_h' unless doc.respond_to?(:to_h)

- model = doc.to_h(include_html: false)
+ model = doc.to_h(include_html: false, include_score: false)
+ model['url'] = self.url(doc.url) # Expand Url String into full object.
+
  Wgit::Utils.remove_non_bson_types(model)
  end

  # Common fields when inserting a record into the DB.
  #
- # @return [Hash] Containing common insertion fields for all models.
+ # @return [Hash] Insertion fields common to all models.
  def self.common_insert_data
  {
- date_added: Wgit::Utils.time_stamp,
+ date_added: Wgit::Utils.time_stamp,
  date_modified: Wgit::Utils.time_stamp
  }
  end

  # Common fields when updating a record in the DB.
  #
- # @return [Hash] Containing common update fields for all models.
+ # @return [Hash] Update fields common to all models.
  def self.common_update_data
  {
  date_modified: Wgit::Utils.time_stamp
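
Wgit::Model.document now drops the transient score and embeds a full url sub-model (built by Wgit::Model.url, so it carries crawled, date_crawled and crawl_duration) instead of just the URL String. A rough sketch with illustrative values:

    require 'wgit'

    url = Wgit::Url.new('https://example.com', crawled: true, crawl_duration: 0.3)
    doc = Wgit::Document.new(url, '<html><p>Hello world</p></html>')

    model = Wgit::Model.document(doc)
    model['url'] # => Hash produced by Wgit::Model.url, not just the URL String.
    # The model contains neither the raw HTML nor the volatile @score value.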
data/lib/wgit/document.rb CHANGED
@@ -87,7 +87,7 @@ module Wgit
  xpath
  end

- # Defines an extension, which is a way to extract HTML elements into
+ # Defines an extension, which is a way to serialise HTML elements into
  # instance variables upon Document initialization. See the default
  # extensions defined in 'document_extensions.rb' as examples.
  #
@@ -189,13 +189,20 @@ module Wgit
  @html[range]
  end

- # Returns the timestamp of when this Wgit::Document was crawled.
+ # Returns the timestamp of when this Document was crawled.
  #
- # @return [Time] Time of when this Wgit::Document was crawled.
+ # @return [Time] Time of when this Document was crawled.
  def date_crawled
  @url.date_crawled
  end

+ # Returns the duration of the crawl for this Document (in seconds).
+ #
+ # @return [Float] The duration of the crawl for this Document.
+ def crawl_duration
+ @url.crawl_duration
+ end
+
  # Returns the base URL of this Wgit::Document. The base URL is either the
  # <base> element's href value or @url (if @base is nil). If @base is
  # present and relative, then @url.to_base + @base is returned. This method
@@ -240,8 +247,9 @@ module Wgit
  # @param include_html [Boolean] Whether or not to include @html in the
  # returned Hash.
  # @return [Hash] Containing self's instance vars.
- def to_h(include_html: false)
+ def to_h(include_html: false, include_score: true)
  ignore = include_html ? [] : ['@html']
+ ignore << '@score' unless include_score
  ignore << '@doc' # Always ignore Nokogiri @doc.

  Wgit::Utils.to_h(self, ignore: ignore)
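
to_h gains an include_score flag defaulting to true, so existing callers keep the old behaviour while the DB model can opt out of persisting the volatile search score. For example, with `doc` being any Wgit::Document:

    doc.to_h                        # Ignores @html, keeps @score (old behaviour).
    doc.to_h(include_html: true)    # Serialises the HTML too.
    doc.to_h(include_score: false)  # Drops @score, as Wgit::Model.document now does.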
@@ -525,7 +533,10 @@ module Wgit
  assert_types(html, [String, NilClass])

  # We already know url.is_a?(String) so parse into Url unless already so.
- @url = Wgit::Url.parse(url)
+ url = Wgit::Url.parse(url)
+ url.crawled = true unless url.crawled # Avoid overriding date_crawled.
+
+ @url = url
  @html = html || ''
  @doc = init_nokogiri
  @score = 0.0
@@ -49,7 +49,7 @@ Wgit::Document.define_extension(
  singleton: false,
  text_content_only: true
  ) do |links|
- links&.map! { |link| Wgit::Url.new(link) }
+ links.map! { |link| Wgit::Url.new(link) }
  end

  # Text.
data/lib/wgit/indexer.rb CHANGED
@@ -12,6 +12,8 @@ module Wgit
  # external url's to be crawled later on. Logs info on the crawl
  # using Wgit.logger as it goes along.
  #
+ # @param connection_string [String] The database connection string. Set as
+ # nil to use ENV['WGIT_CONNECTION_STRING'].
  # @param max_sites [Integer] The number of separate and whole
  # websites to be crawled before the method exits. Defaults to -1 which
  # means the crawl will occur until manually stopped (Ctrl+C etc).
@@ -19,8 +21,10 @@ module Wgit
  # scraped from the web (default is 1GB). Note, that this value is used to
  # determine when to stop crawling; it's not a guarantee of the max data
  # that will be obtained.
- def self.index_www(max_sites: -1, max_data: 1_048_576_000)
- db = Wgit::Database.new
+ def self.index_www(
+ connection_string: nil, max_sites: -1, max_data: 1_048_576_000
+ )
+ db = Wgit::Database.new(connection_string)
  indexer = Wgit::Indexer.new(db)
  indexer.index_www(max_sites: max_sites, max_data: max_data)
  end
@@ -32,14 +36,18 @@ module Wgit
  # There is no max download limit so be careful which sites you index.
  #
  # @param url [Wgit::Url, String] The base Url of the website to crawl.
+ # @param connection_string [String] The database connection string. Set as
+ # nil to use ENV['WGIT_CONNECTION_STRING'].
  # @param insert_externals [Boolean] Whether or not to insert the website's
  # external Url's into the database.
  # @yield [doc] Given the Wgit::Document of each crawled webpage, before it's
  # inserted into the database allowing for prior manipulation.
  # @return [Integer] The total number of pages crawled within the website.
- def self.index_site(url, insert_externals: true, &block)
+ def self.index_site(
+ url, connection_string: nil, insert_externals: true, &block
+ )
  url = Wgit::Url.parse(url)
- db = Wgit::Database.new
+ db = Wgit::Database.new(connection_string)
  indexer = Wgit::Indexer.new(db)
  indexer.index_site(url, insert_externals: insert_externals, &block)
  end
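
Each of these module-level index helpers now accepts a connection_string: keyword rather than relying solely on the environment variable. A hedged example; the site and connection string are placeholders:

    require 'wgit'

    # Passing nil (the default) falls back to ENV['WGIT_CONNECTION_STRING'].
    Wgit.index_site(
      'https://example.com',                               # Illustrative site.
      connection_string: 'mongodb://localhost:27017/wgit', # Placeholder.
      insert_externals: false
    ) do |doc|
      puts doc.title # Inspect each page before it's inserted into the database.
    end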
@@ -51,13 +59,17 @@ module Wgit
  # There is no max download limit so be careful of large pages.
  #
  # @param url [Wgit::Url, String] The Url of the webpage to crawl.
+ # @param connection_string [String] The database connection string. Set as
+ # nil to use ENV['WGIT_CONNECTION_STRING'].
  # @param insert_externals [Boolean] Whether or not to insert the website's
  # external Url's into the database.
  # @yield [doc] Given the Wgit::Document of the crawled webpage, before it's
  # inserted into the database allowing for prior manipulation.
- def self.index_page(url, insert_externals: true, &block)
+ def self.index_page(
+ url, connection_string: nil, insert_externals: true, &block
+ )
  url = Wgit::Url.parse(url)
- db = Wgit::Database.new
+ db = Wgit::Database.new(connection_string)
  indexer = Wgit::Indexer.new(db)
  indexer.index_page(url, insert_externals: insert_externals, &block)
  end
@@ -67,6 +79,8 @@ module Wgit
  # details of how the search works.
  #
  # @param query [String] The text query to search with.
+ # @param connection_string [String] The database connection string. Set as
+ # nil to use ENV['WGIT_CONNECTION_STRING'].
  # @param case_sensitive [Boolean] Whether character case must match.
  # @param whole_sentence [Boolean] Whether multiple words should be searched
  # for separately.
@@ -76,11 +90,20 @@ module Wgit
  # snippet.
  # @yield [doc] Given each search result (Wgit::Document) returned from the
  # database.
- def self.indexed_search(query, case_sensitive: false, whole_sentence: false,
- limit: 10, skip: 0, sentence_limit: 80, &block)
- results = Wgit::Database.new.search(
- query, case_sensitive: case_sensitive, whole_sentence: whole_sentence,
- limit: limit, skip: skip, &block
+ def self.indexed_search(
+ query, connection_string: nil,
+ case_sensitive: false, whole_sentence: false,
+ limit: 10, skip: 0, sentence_limit: 80, &block
+ )
+ db = Wgit::Database.new(connection_string)
+
+ results = db.search(
+ query,
+ case_sensitive: case_sensitive,
+ whole_sentence: whole_sentence,
+ limit: limit,
+ skip: skip,
+ &block
  )

  results.each do |doc|
@@ -88,7 +111,8 @@ module Wgit
  query,
  case_sensitive: case_sensitive,
  whole_sentence: whole_sentence,
- sentence_limit: sentence_limit)
+ sentence_limit: sentence_limit
+ )
  end

  Wgit::Utils.printf_search_results(results)
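
indexed_search now follows the same pattern: build a Database from the (optional) connection string, run the search, post-process each result against the query, then print everything via Wgit::Utils.printf_search_results. A small sketch; the query is illustrative and a reachable database is assumed:

    require 'wgit'

    # A nil connection_string (the default) falls back to ENV['WGIT_CONNECTION_STRING'].
    Wgit.indexed_search('ruby crawler', whole_sentence: true, limit: 5, sentence_limit: 100)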
@@ -137,6 +161,7 @@ runs out of urls to crawl (which might be never).")

  if uncrawled_urls.empty?
  Wgit.logger.info('No urls to crawl, exiting.')
+
  return
  end
  Wgit.logger.info("Starting crawl loop for: #{uncrawled_urls}")
@@ -148,6 +173,7 @@ runs out of urls to crawl (which might be never).")
  unless keep_crawling?(site_count, max_sites, max_data)
  Wgit.logger.info("Reached max number of sites to crawl or \
  database capacity, exiting.")
+
  return
  end
  site_count += 1
data/lib/wgit/url.rb CHANGED
@@ -15,12 +15,15 @@ module Wgit
  include Assertable

  # Whether or not the Url has been crawled or not. A custom crawled= method
- # is also provided by this class.
- attr_reader :crawled
+ # is provided by this class, overridding the default one.
+ attr_accessor :crawled

- # The Time which the Url was crawled.
+ # The Time stamp of when this Url was crawled.
  attr_accessor :date_crawled

+ # The duration of the crawl for this Url (in seconds).
+ attr_accessor :crawl_duration
+
  # Initializes a new instance of Wgit::Url which represents a web based
  # HTTP URL.
  #
@@ -32,8 +35,12 @@ module Wgit
  # @param date_crawled [Time] Should only be provided if crawled is true. A
  # suitable object can be returned from Wgit::Utils.time_stamp. Only used
  # if url_or_obj is a String.
+ # @param crawl_duration [Float] Should only be provided if crawled is true.
+ # The duration of the crawl for this Url (in seconds).
  # @raise [StandardError] If url_or_obj is an Object with missing methods.
- def initialize(url_or_obj, crawled: false, date_crawled: nil)
+ def initialize(
+ url_or_obj, crawled: false, date_crawled: nil, crawl_duration: nil
+ )
  # Init from a URL String.
  if url_or_obj.is_a?(String)
  url = url_or_obj.to_s
@@ -42,14 +49,16 @@ module Wgit
  obj = url_or_obj
  assert_respond_to(obj, :fetch)

- url = obj.fetch('url') # Should always be present.
- crawled = obj.fetch('crawled', false)
- date_crawled = obj.fetch('date_crawled', nil)
+ url = obj.fetch('url') # Should always be present.
+ crawled = obj.fetch('crawled', false)
+ date_crawled = obj.fetch('date_crawled', nil)
+ crawl_duration = obj.fetch('crawl_duration', nil)
  end

- @uri = Addressable::URI.parse(url)
- @crawled = crawled
- @date_crawled = date_crawled
+ @uri = Addressable::URI.parse(url)
+ @crawled = crawled
+ @date_crawled = date_crawled
+ @crawl_duration = crawl_duration

  super(url)
  end
@@ -77,14 +86,16 @@ module Wgit
  obj.is_a?(Wgit::Url) ? obj : new(obj)
  end

- # Sets the @crawled instance var, also setting @date_crawled to the
- # current time or nil (depending on the bool value).
+ # Sets the @crawled instance var, also setting @date_crawled for
+ # convenience.
  #
- # @param bool [Boolean] True if self has been crawled, false otherwise.
- # @return [Time, NilClass] Returns the date crawled, if set.
+ # @param bool [Boolean] True if this Url has been crawled, false otherwise.
+ # @return [Boolean] The value of bool having been set.
  def crawled=(bool)
- @crawled = bool
+ @crawled = bool
  @date_crawled = bool ? Wgit::Utils.time_stamp : nil
+
+ bool
  end

  # Overrides String#replace setting the new_url @uri and String value.
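
A Url now carries a crawl_duration alongside crawled and date_crawled, settable via keyword args or from a DB-style object (obj.fetch('crawl_duration', nil)). A small sketch with illustrative values:

    require 'wgit'

    url = Wgit::Url.new(
      'https://example.com',                # Illustrative URL.
      crawled: true,
      date_crawled: Wgit::Utils.time_stamp,
      crawl_duration: 1.5                   # Seconds.
    )

    url.crawl_duration  # => 1.5
    url.crawled = false # Custom writer: also resets @date_crawled to nil.
    url.date_crawled    # => nil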
data/lib/wgit/utils.rb CHANGED
@@ -120,20 +120,19 @@ module Wgit
  # outputted to the stream.
  # @param stream [#puts] Any object that respond_to?(:puts). It is used
  # to output text somewhere e.g. a file or STDOUT.
- # @return [NilClass] Returns nil.
  def self.printf_search_results(results, keyword_limit: 5, stream: STDOUT)
  raise 'stream must respond_to? :puts' unless stream.respond_to?(:puts)

  results.each do |doc|
- title = (doc.title || '<no title>')
- missing_keywords = (doc.keywords.nil? || doc.keywords.empty?)
- keywords = missing_keywords ? nil : doc.keywords.take(keyword_limit)
+ title = (doc.title || '<no title>')
+ keywords = doc.keywords&.take(keyword_limit)&.join(', ')
  sentence = doc.text.first
+ url = doc.url

  stream.puts title
- stream.puts keywords.join(', ') if keywords
- stream.puts sentence if sentence
- stream.puts doc.url
+ stream.puts keywords if keywords
+ stream.puts sentence
+ stream.puts url
  stream.puts
  end

data/lib/wgit/version.rb CHANGED
@@ -5,5 +5,5 @@
  # @author Michael Telford
  module Wgit
  # The current gem version of Wgit.
- VERSION = '0.2.0'
+ VERSION = '0.3.0'
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: wgit
  version: !ruby/object:Gem::Version
- version: 0.2.0
+ version: 0.3.0
  platform: ruby
  authors:
  - Michael Telford
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2019-09-22 00:00:00.000000000 Z
+ date: 2019-10-08 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: addressable