wgit 0.8.0 → 0.10.2

Sign up to get free protection for your applications and to get access to all the features.
data/lib/wgit/url.rb CHANGED
@@ -6,15 +6,15 @@ require 'uri'
6
6
  require 'addressable/uri'
7
7
 
8
8
  module Wgit
9
- # Class modeling a web based HTTP URL.
9
+ # Class modeling/serialising a web based HTTP URL.
10
10
  #
11
11
  # Can be an internal/relative link e.g. "about.html" or an absolute URL
12
- # e.g. "http://www.google.co.uk". Is a subclass of String and uses 'uri' and
13
- # 'addressable/uri' internally.
12
+ # e.g. "http://www.google.co.uk". Is a subclass of String and uses `URI` and
13
+ # `addressable/uri` internally for parsing.
14
14
  #
15
- # Most of the methods in this class return new Wgit::Url instances making the
16
- # method calls chainable e.g. url.omit_base.omit_fragment etc. The methods
17
- # also try to be idempotent where possible.
15
+ # Most of the methods in this class return new `Wgit::Url` instances making
16
+ # the method calls chainable e.g. `url.omit_base.omit_fragment` etc. The
17
+ # methods also try to be idempotent where possible.
18
18
  class Url < String
19
19
  include Assertable
20
20
 
@@ -28,7 +28,7 @@ module Wgit
28
28
  # The duration of the crawl for this Url (in seconds).
29
29
  attr_accessor :crawl_duration
30
30
 
31
- # Initializes a new instance of Wgit::Url which represents a web based
31
+ # Initializes a new instance of Wgit::Url which models a web based
32
32
  # HTTP URL.
33
33
  #
34
34
  # @param url_or_obj [String, Wgit::Url, #fetch#[]] Is either a String
@@ -99,10 +99,10 @@ module Wgit
99
99
  # @param obj [Object] The object to parse, which #is_a?(String).
100
100
  # @raise [StandardError] If obj.is_a?(String) is false.
101
101
  # @return [Wgit::Url] A Wgit::Url instance or nil (if obj is invalid).
102
- def self.parse_or_nil(obj)
102
+ def self.parse?(obj)
103
103
  parse(obj)
104
104
  rescue Addressable::URI::InvalidURIError
105
- Wgit.logger.debug("Wgit::Url.parse_or_nil('#{obj}') exception: \
105
+ Wgit.logger.debug("Wgit::Url.parse?('#{obj}') exception: \
106
106
  Addressable::URI::InvalidURIError")
107
107
  nil
108
108
  end
@@ -115,8 +115,6 @@ Addressable::URI::InvalidURIError")
115
115
  def crawled=(bool)
116
116
  @crawled = bool
117
117
  @date_crawled = bool ? Wgit::Utils.time_stamp : nil
118
-
119
- bool
120
118
  end
121
119
 
122
120
  # Overrides String#replace setting the new_url @uri and String value.
@@ -146,10 +144,10 @@ Addressable::URI::InvalidURIError")
146
144
  # @param opts [Hash] The options with which to check relativity. Only one
147
145
  # opts param should be provided. The provided opts param Url must be
148
146
  # absolute and be prefixed with a scheme. Consider using the output of
149
- # Wgit::Url#to_base which should work (unless it's nil).
150
- # @option opts [Wgit::Url, String] :base The Url base e.g.
151
- # http://www.google.com/how which gives a base of
152
- # 'http://www.google.com'.
147
+ # Wgit::Url#to_origin which should work (unless it's nil).
148
+ # @option opts [Wgit::Url, String] :origin The Url origin e.g.
149
+ # http://www.google.com:81/how which gives an origin of
150
+ # 'http://www.google.com:81'.
153
151
  # @option opts [Wgit::Url, String] :host The Url host e.g.
154
152
  # http://www.google.com/how which gives a host of 'www.google.com'.
155
153
  # @option opts [Wgit::Url, String] :domain The Url domain e.g.
@@ -160,10 +158,11 @@ Addressable::URI::InvalidURIError")
160
158
  # param has been provided.
161
159
  # @return [Boolean] True if relative, false if absolute.
162
160
  def relative?(opts = {})
163
- defaults = { base: nil, host: nil, domain: nil, brand: nil }
161
+ defaults = { origin: nil, host: nil, domain: nil, brand: nil }
164
162
  opts = defaults.merge(opts)
165
163
  raise 'Url (self) cannot be empty' if empty?
166
164
 
165
+ return false if scheme_relative?
167
166
  return true if @uri.relative?
168
167
 
169
168
  # Self is absolute but may be relative to the opts param e.g. host.
@@ -180,8 +179,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
180
179
  end
181
180
 
182
181
  case type
183
- when :base # http://www.google.com
184
- to_base == url.to_base
182
+ when :origin # http://www.google.com:81
183
+ to_origin == url.to_origin
185
184
  when :host # www.google.com
186
185
  to_host == url.to_host
187
186
  when :domain # google.com
@@ -206,8 +205,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
206
205
  # @return [Boolean] True if valid, absolute and crawlable, otherwise false.
207
206
  def valid?
208
207
  return false if relative?
209
- return false unless to_base && to_domain
210
- return false if URI::DEFAULT_PARSER.make_regexp.match(normalize).nil?
208
+ return false unless to_origin && to_domain
209
+ return false unless URI::DEFAULT_PARSER.make_regexp.match(normalize)
211
210
 
212
211
  true
213
212
  end
@@ -238,7 +237,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
238
237
  Wgit::Url.new(concatted)
239
238
  end
240
239
 
241
- # Normalises/escapes self and returns a new Wgit::Url. Self isn't modified.
240
+ # Normalizes/escapes self and returns a new Wgit::Url. Self isn't modified.
241
+ # This should be used before GET'ing the url, in case it has IRI chars.
242
242
  #
243
243
  # @return [Wgit::Url] An escaped version of self.
244
244
  def normalize
@@ -249,8 +249,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
249
249
  # modify the receiver.
250
250
  #
251
251
  # If self is absolute then it's returned as is, making this method
252
- # idempotent. The doc's <base> element is used if present, otherwise
253
- # doc.url is used as the base; which is concatted with self.
252
+ # idempotent. The doc's `<base>` element is used if present, otherwise
253
+ # `doc.url` is used as the base; which is concatted with self.
254
254
  #
255
255
  # Typically used to build an absolute link obtained from a document.
256
256
  #
@@ -258,35 +258,37 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
258
258
  # link = Wgit::Url.new('/favicon.png')
259
259
  # doc = Wgit::Document.new('http://example.com')
260
260
  #
261
- # link.prefix_base(doc) # => "http://example.com/favicon.png"
261
+ # link.make_absolute(doc) # => "http://example.com/favicon.png"
262
262
  #
263
263
  # @param doc [Wgit::Document] The doc whose base Url is concatted with
264
264
  # self.
265
265
  # @raise [StandardError] If doc isn't a Wgit::Document or if `doc.base_url`
266
266
  # raises an Exception.
267
267
  # @return [Wgit::Url] Self in absolute form.
268
- def prefix_base(doc)
268
+ def make_absolute(doc)
269
269
  assert_type(doc, Wgit::Document)
270
+ raise 'Cannot make absolute when Document @url is not valid' \
271
+ unless doc.url.valid?
272
+
273
+ return prefix_scheme(doc.url.to_scheme&.to_sym) if scheme_relative?
270
274
 
271
275
  absolute? ? self : doc.base_url(link: self).concat(self)
272
276
  end
273
277
 
274
- # Returns self having prefixed a protocol scheme. Doesn't modify receiver.
278
+ # Returns self having prefixed a scheme/protocol. Doesn't modify receiver.
275
279
  # Returns self even if absolute (with scheme); therefore is idempotent.
276
280
  #
277
- # @param protocol [Symbol] Either :http or :https.
278
- # @return [Wgit::Url] Self with a protocol scheme prefix.
279
- def prefix_scheme(protocol: :http)
280
- return self if absolute?
281
-
282
- case protocol
283
- when :http
284
- Wgit::Url.new("http://#{url}")
285
- when :https
286
- Wgit::Url.new("https://#{url}")
287
- else
288
- raise "protocol must be :http or :https, not :#{protocol}"
281
+ # @param scheme [Symbol] Either :http or :https.
282
+ # @return [Wgit::Url] Self with a scheme prefix.
283
+ def prefix_scheme(scheme = :http)
284
+ unless %i[http https].include?(scheme)
285
+ raise "scheme must be :http or :https, not :#{scheme}"
289
286
  end
287
+
288
+ return self if absolute? && !scheme_relative?
289
+
290
+ separator = scheme_relative? ? '' : '//'
291
+ Wgit::Url.new("#{scheme}:#{separator}#{self}")
290
292
  end
291
293
 
292
294
  # Returns a Hash containing this Url's instance vars excluding @uri.
@@ -294,8 +296,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
294
296
  #
295
297
  # @return [Hash] self's instance vars as a Hash.
296
298
  def to_h
297
- ignore = ['@uri']
298
- h = Wgit::Utils.to_h(self, ignore: ignore)
299
+ h = Wgit::Utils.to_h(self, ignore: ['@uri'])
299
300
  Hash[h.to_a.insert(0, ['url', self])] # Insert url at position 0.
300
301
  end
301
302
 
@@ -338,6 +339,20 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
338
339
  host ? Wgit::Url.new(host) : nil
339
340
  end
340
341
 
342
+ # Returns a new Wgit::Url containing just the port of this URL e.g.
343
+ # Given http://www.google.co.uk:443/about.html, '443' is returned.
344
+ #
345
+ # @return [Wgit::Url, nil] Containing just the port or nil.
346
+ def to_port
347
+ port = @uri.port
348
+
349
+ # @uri.port defaults port to 80/443 if missing, so we check for :#{port}.
350
+ return nil unless port
351
+ return nil unless include?(":#{port}")
352
+
353
+ Wgit::Url.new(port.to_s)
354
+ end
355
+
341
356
  # Returns a new Wgit::Url containing just the domain of this URL e.g.
342
357
  # Given http://www.google.co.uk/about.html, google.co.uk is returned.
343
358
  #
@@ -347,6 +362,20 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
347
362
  domain ? Wgit::Url.new(domain) : nil
348
363
  end
349
364
 
365
+ # Returns a new Wgit::Url containing just the sub domain of this URL e.g.
366
+ # Given http://scripts.dev.google.com, scripts.dev is returned.
367
+ #
368
+ # @return [Wgit::Url, nil] Containing just the sub domain or nil.
369
+ def to_sub_domain
370
+ return nil unless to_host
371
+
372
+ dot_domain = ".#{to_domain}"
373
+ return nil unless include?(dot_domain)
374
+
375
+ sub_domain = to_host.sub(dot_domain, '')
376
+ Wgit::Url.new(sub_domain)
377
+ end
378
+
350
379
  # Returns a new Wgit::Url containing just the brand of this URL e.g.
351
380
  # Given http://www.google.co.uk/about.html, google is returned.
352
381
  #
@@ -362,12 +391,24 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
362
391
  # @return [Wgit::Url, nil] The base of self e.g. http://www.google.co.uk or
363
392
  # nil.
364
393
  def to_base
365
- return nil if @uri.scheme.nil? || @uri.host.nil?
394
+ return nil unless @uri.scheme && @uri.host
366
395
 
367
396
  base = "#{@uri.scheme}://#{@uri.host}"
368
397
  Wgit::Url.new(base)
369
398
  end
370
399
 
400
+ # Returns only the origin of this URL e.g. the protocol scheme, host and
401
+ # port combined. For http://localhost:3000/api, http://localhost:3000 gets
402
+ # returned. If there's no port present, then to_base is returned.
403
+ #
404
+ # @return [Wgit::Url, nil] The origin of self or nil.
405
+ def to_origin
406
+ return nil unless to_base
407
+ return to_base unless to_port
408
+
409
+ Wgit::Url.new("#{to_base}:#{to_port}")
410
+ end
411
+
371
412
  # Returns the path of this URL e.g. the bit after the host without slashes.
372
413
  # For example:
373
414
  # Wgit::Url.new("http://www.google.co.uk/about.html/").to_path returns
@@ -396,7 +437,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
396
437
  end
397
438
 
398
439
  # Returns a new Wgit::Url containing just the query string of this URL
399
- # e.g. Given http://google.com?q=ruby, '?q=ruby' is returned.
440
+ # e.g. Given http://google.com?q=foo&bar=1, 'q=foo&bar=1' is returned.
400
441
  #
401
442
  # @return [Wgit::Url, nil] Containing just the query string or nil.
402
443
  def to_query
@@ -404,6 +445,24 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
404
445
  query ? Wgit::Url.new(query) : nil
405
446
  end
406
447
 
448
+ # Returns a Hash containing just the query string parameters of this URL
449
+ # e.g. Given http://google.com?q=ruby, "{ 'q' => 'ruby' }" is returned.
450
+ #
451
+ # @param symbolize_keys [Boolean] The returned Hash keys will be Symbols if
452
+ # true, Strings otherwise.
453
+ # @return [Hash<String | Symbol, String>] Containing the query string
454
+ # params or empty if the URL doesn't contain any query parameters.
455
+ def to_query_hash(symbolize_keys: false)
456
+ query_str = to_query
457
+ return {} unless query_str
458
+
459
+ query_str.split('&').each_with_object({}) do |param, hash|
460
+ k, v = param.split('=')
461
+ k = k.to_sym if symbolize_keys
462
+ hash[k] = v
463
+ end
464
+ end
465
+
407
466
  # Returns a new Wgit::Url containing just the fragment string of this URL
408
467
  # e.g. Given http://google.com#about, #about is returned.
409
468
  #
@@ -425,6 +484,24 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
425
484
  segs.length > 1 ? Wgit::Url.new(segs.last) : nil
426
485
  end
427
486
 
487
+ # Returns a new Wgit::Url containing just the username string of this URL
488
+ # e.g. Given http://me:pass1@example.com, me is returned.
489
+ #
490
+ # @return [Wgit::Url, nil] Containing just the user string or nil.
491
+ def to_user
492
+ user = @uri.user
493
+ user ? Wgit::Url.new(user) : nil
494
+ end
495
+
496
+ # Returns a new Wgit::Url containing just the password string of this URL
497
+ # e.g. Given http://me:pass1@example.com, pass1 is returned.
498
+ #
499
+ # @return [Wgit::Url, nil] Containing just the password string or nil.
500
+ def to_password
501
+ password = @uri.password
502
+ password ? Wgit::Url.new(password) : nil
503
+ end
504
+
428
505
  # Omits the given URL components from self and returns a new Wgit::Url.
429
506
  #
430
507
  # Calls Addressable::URI#omit underneath and creates a new Wgit::Url from
@@ -468,7 +545,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
468
545
  .omit_trailing_slash
469
546
  end
470
547
 
471
- # Returns a new Wgit::Url with the base (proto and host) removed e.g. Given
548
+ # Returns a new Wgit::Url with the base (scheme and host) removed e.g. Given
472
549
  # http://google.com/search?q=something#about, search?q=something#about is
473
550
  # returned. If relative and base isn't present then self is returned.
474
551
  # Leading and trailing slashes are always stripped from the return value.
@@ -483,6 +560,21 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
483
560
  Wgit::Url.new(omit_base).omit_slashes
484
561
  end
485
562
 
563
+ # Returns a new Wgit::Url with the origin (base + port) removed e.g. Given
564
+ # http://google.com:81/search?q=something#about, search?q=something#about is
565
+ # returned. If relative and base isn't present then self is returned.
566
+ # Leading and trailing slashes are always stripped from the return value.
567
+ #
568
+ # @return [Wgit::Url] Self containing everything after the origin.
569
+ def omit_origin
570
+ origin = to_origin
571
+ omit_origin = origin ? gsub(origin, '') : self
572
+
573
+ return self if ['', '/'].include?(omit_origin)
574
+
575
+ Wgit::Url.new(omit_origin).omit_slashes
576
+ end
577
+
486
578
  # Returns a new Wgit::Url with the query string portion removed e.g. Given
487
579
  # http://google.com/search?q=hello, http://google.com/search is
488
580
  # returned. Self is returned as is if no query string is present. A URL
@@ -528,25 +620,47 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
528
620
  start_with?('#')
529
621
  end
530
622
 
531
- alias + concat
532
- alias crawled? crawled
533
- alias normalise normalize
534
- alias is_relative? relative?
535
- alias is_absolute? absolute?
536
- alias is_valid? valid?
537
- alias is_query? query?
538
- alias is_fragment? fragment?
539
- alias uri to_uri
540
- alias url to_url
541
- alias scheme to_scheme
542
- alias host to_host
543
- alias domain to_domain
544
- alias brand to_brand
545
- alias base to_base
546
- alias path to_path
547
- alias endpoint to_endpoint
548
- alias query to_query
549
- alias fragment to_fragment
550
- alias extension to_extension
623
+ # Returns true if self equals '/' a.k.a. index.
624
+ #
625
+ # @return [Boolean] True if self equals '/', false otherwise.
626
+ def index?
627
+ self == '/'
628
+ end
629
+
630
+ # Returns true if self starts with '//' a.k.a. a scheme/protocol relative
631
+ # path.
632
+ #
633
+ # @return [Boolean] True if self starts with '//', false otherwise.
634
+ def scheme_relative?
635
+ start_with?('//')
636
+ end
637
+
638
+ alias + concat
639
+ alias crawled? crawled
640
+ alias is_relative? relative?
641
+ alias is_absolute? absolute?
642
+ alias is_valid? valid?
643
+ alias is_query? query?
644
+ alias is_fragment? fragment?
645
+ alias is_index? index?
646
+ alias is_scheme_relative? scheme_relative?
647
+ alias uri to_uri
648
+ alias url to_url
649
+ alias scheme to_scheme
650
+ alias host to_host
651
+ alias port to_port
652
+ alias domain to_domain
653
+ alias brand to_brand
654
+ alias base to_base
655
+ alias origin to_origin
656
+ alias path to_path
657
+ alias endpoint to_endpoint
658
+ alias query to_query
659
+ alias query_hash to_query_hash
660
+ alias fragment to_fragment
661
+ alias extension to_extension
662
+ alias user to_user
663
+ alias password to_password
664
+ alias sub_domain to_sub_domain
551
665
  end
552
666
  end
data/lib/wgit/utils.rb CHANGED
@@ -145,7 +145,8 @@ module Wgit
145
145
  # @param keyword_limit [Integer] The max amount of keywords to be
146
146
  # outputted to the stream.
147
147
  # @param stream [#puts] Any object that respond_to?(:puts). It is used
148
- # to output text somewhere e.g. a file or STDOUT.
148
+ # to output text somewhere e.g. a file or STDERR.
149
+ # @return [Integer] The number of results.
149
150
  def self.printf_search_results(results, keyword_limit: 5, stream: STDOUT)
150
151
  raise 'stream must respond_to? :puts' unless stream.respond_to?(:puts)
151
152
 
@@ -162,18 +163,37 @@ module Wgit
162
163
  stream.puts
163
164
  end
164
165
 
165
- nil
166
+ results.size
166
167
  end
167
168
 
168
- # Processes a String to make it uniform. Strips any leading/trailing white
169
+ # Sanitises the obj to make it uniform by calling the correct sanitize_*
170
+ # method for its type e.g. if obj.is_a? String then sanitize(obj). Any type
171
+ # not in the case statement will be ignored and returned as is.
172
+ #
173
+ # @param obj [Object] The object to be sanitized.
174
+ # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
175
+ # invalid characters.
176
+ # @return [Object] The sanitized obj is both modified and then returned.
177
+ def self.sanitize(obj, encode: true)
178
+ case obj
179
+ when String
180
+ sanitize_str(obj, encode: encode)
181
+ when Array
182
+ sanitize_arr(obj, encode: encode)
183
+ else
184
+ obj
185
+ end
186
+ end
187
+
188
+ # Sanitises a String to make it uniform. Strips any leading/trailing white
169
189
  # space. Also applies UTF-8 encoding (replacing invalid characters) if
170
190
  # `encode: true`.
171
191
  #
172
- # @param str [String] The String to process. str is modified.
192
+ # @param str [String] The String to sanitize. str is modified.
173
193
  # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
174
194
  # invalid characters.
175
- # @return [String] The processed str is both modified and then returned.
176
- def self.process_str(str, encode: true)
195
+ # @return [String] The sanitized str is both modified and then returned.
196
+ def self.sanitize_str(str, encode: true)
177
197
  if str.is_a?(String)
178
198
  str.encode!('UTF-8', undef: :replace, invalid: :replace) if encode
179
199
  str.strip!
@@ -182,15 +202,15 @@ module Wgit
182
202
  str
183
203
  end
184
204
 
185
- # Processes an Array to make it uniform. Removes empty Strings and nils,
186
- # processes non empty Strings using Wgit::Utils.process_str and removes
205
+ # Sanitises an Array to make it uniform. Removes empty Strings and nils,
206
+ # processes non empty Strings using Wgit::Utils.sanitize and removes
187
207
  # duplicates.
188
208
  #
189
- # @param arr [Enumerable] The Array to process. arr is modified.
190
- # @return [Enumerable] The processed arr is both modified and then returned.
191
- def self.process_arr(arr, encode: true)
209
+ # @param arr [Enumerable] The Array to sanitize. arr is modified.
210
+ # @return [Enumerable] The sanitized arr is both modified and then returned.
211
+ def self.sanitize_arr(arr, encode: true)
192
212
  if arr.is_a?(Array)
193
- arr.map! { |str| process_str(str, encode: encode) }
213
+ arr.map! { |str| sanitize(str, encode: encode) }
194
214
  arr.reject! { |str| str.is_a?(String) ? str.empty? : false }
195
215
  arr.compact!
196
216
  arr.uniq!
@@ -198,13 +218,5 @@ module Wgit
198
218
 
199
219
  arr
200
220
  end
201
-
202
- # Returns the model having removed non bson types (for use with MongoDB).
203
- #
204
- # @param model_hash [Hash] The model Hash to process.
205
- # @return [Hash] The model Hash with non bson types removed.
206
- def self.remove_non_bson_types(model_hash)
207
- model_hash.select { |_k, v| v.respond_to?(:bson_type) }
208
- end
209
221
  end
210
222
  end
data/lib/wgit/version.rb CHANGED
@@ -2,10 +2,11 @@
2
2
 
3
3
  # Wgit is a WWW indexer/scraper which crawls URL's and retrieves their page
4
4
  # contents for later use.
5
+ #
5
6
  # @author Michael Telford
6
7
  module Wgit
7
8
  # The current gem version of Wgit.
8
- VERSION = '0.8.0'
9
+ VERSION = '0.10.2'
9
10
 
10
11
  # Returns the current gem version of Wgit as a String.
11
12
  def self.version
data/lib/wgit.rb CHANGED
@@ -6,9 +6,11 @@ require_relative 'wgit/assertable'
6
6
  require_relative 'wgit/utils'
7
7
  require_relative 'wgit/url'
8
8
  require_relative 'wgit/document'
9
- require_relative 'wgit/document_extensions'
9
+ require_relative 'wgit/document_extractors'
10
10
  require_relative 'wgit/crawler'
11
11
  require_relative 'wgit/database/model'
12
12
  require_relative 'wgit/database/database'
13
13
  require_relative 'wgit/indexer'
14
+ require_relative 'wgit/dsl'
15
+ require_relative 'wgit/base'
14
16
  # require_relative 'wgit/core_ext' - Must be explicitly required.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wgit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.0
4
+ version: 0.10.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-01-27 00:00:00.000000000 Z
11
+ date: 2021-11-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '1.3'
69
+ - !ruby/object:Gem::Dependency
70
+ name: ferrum
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '0.8'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '0.8'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: byebug
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -184,14 +198,10 @@ dependencies:
184
198
  - - "<"
185
199
  - !ruby/object:Gem::Version
186
200
  version: '1.0'
187
- description: 'Fundamentally, Wgit is a HTTP indexer/scraper which crawls URL''s to
188
- retrieve and serialise their page contents for later use. You can use Wgit to scrape
189
- entire websites if required. Wgit also provides a means to search indexed documents
190
- stored in a database. Therefore, this library provides the main components of a
191
- WWW search engine. The Wgit API is easily extended allowing you to pull out the
192
- parts of a webpage that are important to you, the code snippets or tables for example.
193
- As Wgit is a library, it supports many different use cases including data mining,
194
- analytics, web indexing and URL parsing to name a few.
201
+ description: 'Wgit was primarily designed to crawl static HTML websites to index and
202
+ search their content - providing the basis of any search engine; but Wgit is suitable
203
+ for many application domains including: URL parsing, data mining and statistical
204
+ analysis.
195
205
 
196
206
  '
197
207
  email: michael.telford@live.com
@@ -202,12 +212,14 @@ extra_rdoc_files: []
202
212
  files:
203
213
  - "./lib/wgit.rb"
204
214
  - "./lib/wgit/assertable.rb"
215
+ - "./lib/wgit/base.rb"
205
216
  - "./lib/wgit/core_ext.rb"
206
217
  - "./lib/wgit/crawler.rb"
207
218
  - "./lib/wgit/database/database.rb"
208
219
  - "./lib/wgit/database/model.rb"
209
220
  - "./lib/wgit/document.rb"
210
- - "./lib/wgit/document_extensions.rb"
221
+ - "./lib/wgit/document_extractors.rb"
222
+ - "./lib/wgit/dsl.rb"
211
223
  - "./lib/wgit/indexer.rb"
212
224
  - "./lib/wgit/logger.rb"
213
225
  - "./lib/wgit/response.rb"
@@ -229,7 +241,7 @@ metadata:
229
241
  source_code_uri: https://github.com/michaeltelford/wgit
230
242
  changelog_uri: https://github.com/michaeltelford/wgit/blob/master/CHANGELOG.md
231
243
  bug_tracker_uri: https://github.com/michaeltelford/wgit/issues
232
- documentation_uri: https://www.rubydoc.info/github/michaeltelford/wgit/master
244
+ documentation_uri: https://www.rubydoc.info/gems/wgit
233
245
  allowed_push_host: https://rubygems.org
234
246
  post_install_message: Added the 'wgit' executable to $PATH
235
247
  rdoc_options: []
@@ -237,18 +249,21 @@ require_paths:
237
249
  - lib
238
250
  required_ruby_version: !ruby/object:Gem::Requirement
239
251
  requirements:
240
- - - "~>"
252
+ - - ">="
253
+ - !ruby/object:Gem::Version
254
+ version: '2.6'
255
+ - - "<"
241
256
  - !ruby/object:Gem::Version
242
- version: '2.5'
257
+ version: '4'
243
258
  required_rubygems_version: !ruby/object:Gem::Requirement
244
259
  requirements:
245
260
  - - ">="
246
261
  - !ruby/object:Gem::Version
247
262
  version: '0'
248
263
  requirements: []
249
- rubygems_version: 3.0.6
250
- signing_key:
264
+ rubygems_version: 3.2.22
265
+ signing_key:
251
266
  specification_version: 4
252
- summary: Wgit is a Ruby library primarily used for crawling, indexing and searching
253
- HTML webpages.
267
+ summary: Wgit is a HTML web crawler, written in Ruby, that allows you to programmatically
268
+ extract the data you want from the web.
254
269
  test_files: []