wgit 0.7.0 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/wgit/url.rb CHANGED
@@ -6,15 +6,15 @@ require 'uri'
6
6
  require 'addressable/uri'
7
7
 
8
8
  module Wgit
9
- # Class modeling a web based HTTP URL.
9
+ # Class modeling/serialising a web based HTTP URL.
10
10
  #
11
11
  # Can be an internal/relative link e.g. "about.html" or an absolute URL
12
- # e.g. "http://www.google.co.uk". Is a subclass of String and uses 'uri' and
13
- # 'addressable/uri' internally.
12
+ # e.g. "http://www.google.co.uk". Is a subclass of String and uses `URI` and
13
+ # `addressable/uri` internally for parsing.
14
14
  #
15
- # Most of the methods in this class return new Wgit::Url instances making the
16
- # method calls chainable e.g. url.omit_base.omit_fragment etc. The methods
17
- # also try to be idempotent where possible.
15
+ # Most of the methods in this class return new `Wgit::Url` instances making
16
+ # the method calls chainable e.g. `url.omit_base.omit_fragment` etc. The
17
+ # methods also try to be idempotent where possible.
18
18
  class Url < String
19
19
  include Assertable
20
20
 
@@ -28,7 +28,7 @@ module Wgit
28
28
  # The duration of the crawl for this Url (in seconds).
29
29
  attr_accessor :crawl_duration
30
30
 
31
- # Initializes a new instance of Wgit::Url which represents a web based
31
+ # Initializes a new instance of Wgit::Url which models a web based
32
32
  # HTTP URL.
33
33
  #
34
34
  # @param url_or_obj [String, Wgit::Url, #fetch#[]] Is either a String
@@ -90,6 +90,23 @@ module Wgit
90
90
  obj.is_a?(Wgit::Url) ? obj : new(obj)
91
91
  end
92
92
 
93
+ # Returns a Wgit::Url instance from Wgit::Url.parse, or nil if obj cannot
94
+ # be parsed successfully e.g. the String is invalid.
95
+ #
96
+ # Use this method when you can't guarantee that obj is parsable as a URL.
97
+ # See Wgit::Url.parse for more information.
98
+ #
99
+ # @param obj [Object] The object to parse, which #is_a?(String).
100
+ # @raise [StandardError] If obj.is_a?(String) is false.
101
+ # @return [Wgit::Url, nil] A Wgit::Url instance or nil (if obj is invalid).
102
+ def self.parse?(obj)
103
+ parse(obj)
104
+ rescue Addressable::URI::InvalidURIError
105
+ Wgit.logger.debug("Wgit::Url.parse?('#{obj}') exception: \
106
+ Addressable::URI::InvalidURIError")
107
+ nil
108
+ end
109
+
93
110
  # Sets the @crawled instance var, also setting @date_crawled for
94
111
  # convenience.
95
112
  #
@@ -98,8 +115,6 @@ module Wgit
98
115
  def crawled=(bool)
99
116
  @crawled = bool
100
117
  @date_crawled = bool ? Wgit::Utils.time_stamp : nil
101
-
102
- bool
103
118
  end
104
119
 
105
120
  # Overrides String#replace setting the new_url @uri and String value.
@@ -129,10 +144,10 @@ module Wgit
129
144
  # @param opts [Hash] The options with which to check relativity. Only one
130
145
  # opts param should be provided. The provided opts param Url must be
131
146
  # absolute and be prefixed with a scheme. Consider using the output of
132
- # Wgit::Url#to_base which should work (unless it's nil).
133
- # @option opts [Wgit::Url, String] :base The Url base e.g.
134
- # http://www.google.com/how which gives a base of
135
- # 'http://www.google.com'.
147
+ # Wgit::Url#to_origin which should work (unless it's nil).
148
+ # @option opts [Wgit::Url, String] :origin The Url origin e.g.
149
+ # http://www.google.com:81/how which gives an origin of
150
+ # 'http://www.google.com:81'.
136
151
  # @option opts [Wgit::Url, String] :host The Url host e.g.
137
152
  # http://www.google.com/how which gives a host of 'www.google.com'.
138
153
  # @option opts [Wgit::Url, String] :domain The Url domain e.g.
@@ -143,10 +158,11 @@ module Wgit
143
158
  # param has been provided.
144
159
  # @return [Boolean] True if relative, false if absolute.
145
160
  def relative?(opts = {})
146
- defaults = { base: nil, host: nil, domain: nil, brand: nil }
161
+ defaults = { origin: nil, host: nil, domain: nil, brand: nil }
147
162
  opts = defaults.merge(opts)
148
163
  raise 'Url (self) cannot be empty' if empty?
149
164
 
165
+ return false if scheme_relative?
150
166
  return true if @uri.relative?
151
167
 
152
168
  # Self is absolute but may be relative to the opts param e.g. host.
@@ -163,8 +179,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
163
179
  end
164
180
 
165
181
  case type
166
- when :base # http://www.google.com
167
- to_base == url.to_base
182
+ when :origin # http://www.google.com:81
183
+ to_origin == url.to_origin
168
184
  when :host # www.google.com
169
185
  to_host == url.to_host
170
186
  when :domain # google.com
@@ -189,8 +205,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
189
205
  # @return [Boolean] True if valid, absolute and crawlable, otherwise false.
190
206
  def valid?
191
207
  return false if relative?
192
- return false unless to_base && to_domain
193
- return false if URI::DEFAULT_PARSER.make_regexp.match(normalize).nil?
208
+ return false unless to_origin && to_domain
209
+ return false unless URI::DEFAULT_PARSER.make_regexp.match(normalize)
194
210
 
195
211
  true
196
212
  end
@@ -221,7 +237,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
221
237
  Wgit::Url.new(concatted)
222
238
  end
223
239
 
224
- # Normalises/escapes self and returns a new Wgit::Url. Self isn't modified.
240
+ # Normalizes/escapes self and returns a new Wgit::Url. Self isn't modified.
241
+ # This should be used before GET'ing the url, in case it has IRI chars.
225
242
  #
226
243
  # @return [Wgit::Url] An escaped version of self.
227
244
  def normalize
@@ -232,8 +249,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
232
249
  # modify the receiver.
233
250
  #
234
251
  # If self is absolute then it's returned as is, making this method
235
- # idempotent. The doc's <base> element is used if present, otherwise
236
- # doc.url is used as the base; which is concatted with self.
252
+ # idempotent. The doc's `<base>` element is used if present, otherwise
253
+ # `doc.url` is used as the base; which is concatted with self.
237
254
  #
238
255
  # Typically used to build an absolute link obtained from a document.
239
256
  #
@@ -241,35 +258,37 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
241
258
  # link = Wgit::Url.new('/favicon.png')
242
259
  # doc = Wgit::Document.new('http://example.com')
243
260
  #
244
- # link.prefix_base(doc) # => "http://example.com/favicon.png"
261
+ # link.make_absolute(doc) # => "http://example.com/favicon.png"
245
262
  #
246
263
  # @param doc [Wgit::Document] The doc whose base Url is concatted with
247
264
  # self.
248
265
  # @raise [StandardError] If doc isn't a Wgit::Document or if `doc.base_url`
249
266
  # raises an Exception.
250
267
  # @return [Wgit::Url] Self in absolute form.
251
- def prefix_base(doc)
268
+ def make_absolute(doc)
252
269
  assert_type(doc, Wgit::Document)
270
+ raise 'Cannot make absolute when Document @url is not valid' \
271
+ unless doc.url.valid?
272
+
273
+ return prefix_scheme(doc.url.to_scheme&.to_sym) if scheme_relative?
253
274
 
254
275
  absolute? ? self : doc.base_url(link: self).concat(self)
255
276
  end
256
277
 
257
- # Returns self having prefixed a protocol scheme. Doesn't modify receiver.
278
+ # Returns self having prefixed a scheme/protocol. Doesn't modify receiver.
258
279
  # Returns self even if absolute (with scheme); therefore is idempotent.
259
280
  #
260
- # @param protocol [Symbol] Either :http or :https.
261
- # @return [Wgit::Url] Self with a protocol scheme prefix.
262
- def prefix_scheme(protocol: :http)
263
- return self if absolute?
264
-
265
- case protocol
266
- when :http
267
- Wgit::Url.new("http://#{url}")
268
- when :https
269
- Wgit::Url.new("https://#{url}")
270
- else
271
- raise "protocol must be :http or :https, not :#{protocol}"
281
+ # @param scheme [Symbol] Either :http or :https.
282
+ # @return [Wgit::Url] Self with a scheme prefix.
283
+ def prefix_scheme(scheme = :http)
284
+ unless %i[http https].include?(scheme)
285
+ raise "scheme must be :http or :https, not :#{scheme}"
272
286
  end
287
+
288
+ return self if absolute? && !scheme_relative?
289
+
290
+ separator = scheme_relative? ? '' : '//'
291
+ Wgit::Url.new("#{scheme}:#{separator}#{self}")
273
292
  end
274
293
 
275
294
  # Returns a Hash containing this Url's instance vars excluding @uri.
@@ -277,8 +296,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
277
296
  #
278
297
  # @return [Hash] self's instance vars as a Hash.
279
298
  def to_h
280
- ignore = ['@uri']
281
- h = Wgit::Utils.to_h(self, ignore: ignore)
299
+ h = Wgit::Utils.to_h(self, ignore: ['@uri'])
282
300
  Hash[h.to_a.insert(0, ['url', self])] # Insert url at position 0.
283
301
  end
284
302
 
@@ -321,6 +339,20 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
321
339
  host ? Wgit::Url.new(host) : nil
322
340
  end
323
341
 
342
+ # Returns a new Wgit::Url containing just the port of this URL e.g.
343
+ # Given http://www.google.co.uk:443/about.html, '443' is returned.
344
+ #
345
+ # @return [Wgit::Url, nil] Containing just the port or nil.
346
+ def to_port
347
+ port = @uri.port
348
+
349
+ # @uri.port defaults port to 80/443 if missing, so we check for :#{port}.
350
+ return nil unless port
351
+ return nil unless include?(":#{port}")
352
+
353
+ Wgit::Url.new(port.to_s)
354
+ end
355
+
324
356
  # Returns a new Wgit::Url containing just the domain of this URL e.g.
325
357
  # Given http://www.google.co.uk/about.html, google.co.uk is returned.
326
358
  #
@@ -330,6 +362,20 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
330
362
  domain ? Wgit::Url.new(domain) : nil
331
363
  end
332
364
 
365
+ # Returns a new Wgit::Url containing just the sub domain of this URL e.g.
366
+ # Given http://scripts.dev.google.com, scripts.dev is returned.
367
+ #
368
+ # @return [Wgit::Url, nil] Containing just the sub domain or nil.
369
+ def to_sub_domain
370
+ return nil unless to_host
371
+
372
+ dot_domain = ".#{to_domain}"
373
+ return nil unless include?(dot_domain)
374
+
375
+ sub_domain = to_host.sub(dot_domain, '')
376
+ Wgit::Url.new(sub_domain)
377
+ end
378
+
333
379
  # Returns a new Wgit::Url containing just the brand of this URL e.g.
334
380
  # Given http://www.google.co.uk/about.html, google is returned.
335
381
  #
@@ -345,12 +391,24 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
345
391
  # @return [Wgit::Url, nil] The base of self e.g. http://www.google.co.uk or
346
392
  # nil.
347
393
  def to_base
348
- return nil if @uri.scheme.nil? || @uri.host.nil?
394
+ return nil unless @uri.scheme && @uri.host
349
395
 
350
396
  base = "#{@uri.scheme}://#{@uri.host}"
351
397
  Wgit::Url.new(base)
352
398
  end
353
399
 
400
+ # Returns only the origin of this URL e.g. the protocol scheme, host and
401
+ # port combined. For http://localhost:3000/api, http://localhost:3000 gets
402
+ # returned. If there's no port present, then to_base is returned.
403
+ #
404
+ # @return [Wgit::Url, nil] The origin of self or nil.
405
+ def to_origin
406
+ return nil unless to_base
407
+ return to_base unless to_port
408
+
409
+ Wgit::Url.new("#{to_base}:#{to_port}")
410
+ end
411
+
354
412
  # Returns the path of this URL e.g. the bit after the host without slashes.
355
413
  # For example:
356
414
  # Wgit::Url.new("http://www.google.co.uk/about.html/").to_path returns
@@ -379,7 +437,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
379
437
  end
380
438
 
381
439
  # Returns a new Wgit::Url containing just the query string of this URL
382
- # e.g. Given http://google.com?q=ruby, '?q=ruby' is returned.
440
+ # e.g. Given http://google.com?q=foo&bar=1, 'q=foo&bar=1' is returned.
383
441
  #
384
442
  # @return [Wgit::Url, nil] Containing just the query string or nil.
385
443
  def to_query
@@ -387,6 +445,24 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
387
445
  query ? Wgit::Url.new(query) : nil
388
446
  end
389
447
 
448
+ # Returns a Hash containing just the query string parameters of this URL
449
+ # e.g. Given http://google.com?q=ruby, "{ 'q' => 'ruby' }" is returned.
450
+ #
451
+ # @param symbolize_keys [Boolean] The returned Hash keys will be Symbols if
452
+ # true, Strings otherwise.
453
+ # @return [Hash<String | Symbol, String>] Containing the query string
454
+ # params or empty if the URL doesn't contain any query parameters.
455
+ def to_query_hash(symbolize_keys: false)
456
+ query_str = to_query
457
+ return {} unless query_str
458
+
459
+ query_str.split('&').each_with_object({}) do |param, hash|
460
+ k, v = param.split('=')
461
+ k = k.to_sym if symbolize_keys
462
+ hash[k] = v
463
+ end
464
+ end
465
+
390
466
  # Returns a new Wgit::Url containing just the fragment string of this URL
391
467
  # e.g. Given http://google.com#about, #about is returned.
392
468
  #
@@ -408,6 +484,24 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
408
484
  segs.length > 1 ? Wgit::Url.new(segs.last) : nil
409
485
  end
410
486
 
487
+ # Returns a new Wgit::Url containing just the username string of this URL
488
+ # e.g. Given http://me:pass1@example.com, me is returned.
489
+ #
490
+ # @return [Wgit::Url, nil] Containing just the user string or nil.
491
+ def to_user
492
+ user = @uri.user
493
+ user ? Wgit::Url.new(user) : nil
494
+ end
495
+
496
+ # Returns a new Wgit::Url containing just the password string of this URL
497
+ # e.g. Given http://me:pass1@example.com, pass1 is returned.
498
+ #
499
+ # @return [Wgit::Url, nil] Containing just the password string or nil.
500
+ def to_password
501
+ password = @uri.password
502
+ password ? Wgit::Url.new(password) : nil
503
+ end
504
+
411
505
  # Omits the given URL components from self and returns a new Wgit::Url.
412
506
  #
413
507
  # Calls Addressable::URI#omit underneath and creates a new Wgit::Url from
@@ -451,7 +545,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
451
545
  .omit_trailing_slash
452
546
  end
453
547
 
454
- # Returns a new Wgit::Url with the base (proto and host) removed e.g. Given
548
+ # Returns a new Wgit::Url with the base (scheme and host) removed e.g. Given
455
549
  # http://google.com/search?q=something#about, search?q=something#about is
456
550
  # returned. If relative and base isn't present then self is returned.
457
551
  # Leading and trailing slashes are always stripped from the return value.
@@ -466,6 +560,21 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
466
560
  Wgit::Url.new(omit_base).omit_slashes
467
561
  end
468
562
 
563
+ # Returns a new Wgit::Url with the origin (base + port) removed e.g. Given
564
+ # http://google.com:81/search?q=something#about, search?q=something#about is
565
+ # returned. If relative and base isn't present then self is returned.
566
+ # Leading and trailing slashes are always stripped from the return value.
567
+ #
568
+ # @return [Wgit::Url] Self containing everything after the origin.
569
+ def omit_origin
570
+ origin = to_origin
571
+ omit_origin = origin ? gsub(origin, '') : self
572
+
573
+ return self if ['', '/'].include?(omit_origin)
574
+
575
+ Wgit::Url.new(omit_origin).omit_slashes
576
+ end
577
+
469
578
  # Returns a new Wgit::Url with the query string portion removed e.g. Given
470
579
  # http://google.com/search?q=hello, http://google.com/search is
471
580
  # returned. Self is returned as is if no query string is present. A URL
@@ -511,25 +620,47 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
511
620
  start_with?('#')
512
621
  end
513
622
 
514
- alias + concat
515
- alias crawled? crawled
516
- alias normalise normalize
517
- alias is_relative? relative?
518
- alias is_absolute? absolute?
519
- alias is_valid? valid?
520
- alias is_query? query?
521
- alias is_fragment? fragment?
522
- alias uri to_uri
523
- alias url to_url
524
- alias scheme to_scheme
525
- alias host to_host
526
- alias domain to_domain
527
- alias brand to_brand
528
- alias base to_base
529
- alias path to_path
530
- alias endpoint to_endpoint
531
- alias query to_query
532
- alias fragment to_fragment
533
- alias extension to_extension
623
+ # Returns true if self equals '/' a.k.a. index.
624
+ #
625
+ # @return [Boolean] True if self equals '/', false otherwise.
626
+ def index?
627
+ self == '/'
628
+ end
629
+
630
+ # Returns true if self starts with '//' a.k.a a scheme/protocol relative
631
+ # path.
632
+ #
633
+ # @return [Boolean] True if self starts with '//', false otherwise.
634
+ def scheme_relative?
635
+ start_with?('//')
636
+ end
637
+
638
+ alias + concat
639
+ alias crawled? crawled
640
+ alias is_relative? relative?
641
+ alias is_absolute? absolute?
642
+ alias is_valid? valid?
643
+ alias is_query? query?
644
+ alias is_fragment? fragment?
645
+ alias is_index? index?
646
+ alias is_scheme_relative? scheme_relative?
647
+ alias uri to_uri
648
+ alias url to_url
649
+ alias scheme to_scheme
650
+ alias host to_host
651
+ alias port to_port
652
+ alias domain to_domain
653
+ alias brand to_brand
654
+ alias base to_base
655
+ alias origin to_origin
656
+ alias path to_path
657
+ alias endpoint to_endpoint
658
+ alias query to_query
659
+ alias query_hash to_query_hash
660
+ alias fragment to_fragment
661
+ alias extension to_extension
662
+ alias user to_user
663
+ alias password to_password
664
+ alias sub_domain to_sub_domain
534
665
  end
535
666
  end
data/lib/wgit/utils.rb CHANGED
@@ -145,7 +145,8 @@ module Wgit
145
145
  # @param keyword_limit [Integer] The max amount of keywords to be
146
146
  # outputted to the stream.
147
147
  # @param stream [#puts] Any object that respond_to?(:puts). It is used
148
- # to output text somewhere e.g. a file or STDOUT.
148
+ # to output text somewhere e.g. a file or STDERR.
149
+ # @return [Integer] The number of results.
149
150
  def self.printf_search_results(results, keyword_limit: 5, stream: STDOUT)
150
151
  raise 'stream must respond_to? :puts' unless stream.respond_to?(:puts)
151
152
 
@@ -162,18 +163,37 @@ module Wgit
162
163
  stream.puts
163
164
  end
164
165
 
165
- nil
166
+ results.size
166
167
  end
167
168
 
168
- # Processes a String to make it uniform. Strips any leading/trailing white
169
+ # Sanitises the obj to make it uniform by calling the correct sanitize_*
170
+ # method for its type e.g. if obj.is_a? String then sanitize_str(obj). Any type
171
+ # not in the case statement will be ignored and returned as is.
172
+ #
173
+ # @param obj [Object] The object to be sanitized.
174
+ # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
175
+ # invalid characters.
176
+ # @return [Object] The sanitized obj is both modified and then returned.
177
+ def self.sanitize(obj, encode: true)
178
+ case obj
179
+ when String
180
+ sanitize_str(obj, encode: encode)
181
+ when Array
182
+ sanitize_arr(obj, encode: encode)
183
+ else
184
+ obj
185
+ end
186
+ end
187
+
188
+ # Sanitises a String to make it uniform. Strips any leading/trailing white
169
189
  # space. Also applies UTF-8 encoding (replacing invalid characters) if
170
190
  # `encode: true`.
171
191
  #
172
- # @param str [String] The String to process. str is modified.
192
+ # @param str [String] The String to sanitize. str is modified.
173
193
  # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
174
194
  # invalid characters.
175
- # @return [String] The processed str is both modified and then returned.
176
- def self.process_str(str, encode: true)
195
+ # @return [String] The sanitized str is both modified and then returned.
196
+ def self.sanitize_str(str, encode: true)
177
197
  if str.is_a?(String)
178
198
  str.encode!('UTF-8', undef: :replace, invalid: :replace) if encode
179
199
  str.strip!
@@ -182,15 +202,15 @@ module Wgit
182
202
  str
183
203
  end
184
204
 
185
- # Processes an Array to make it uniform. Removes empty Strings and nils,
186
- # processes non empty Strings using Wgit::Utils.process_str and removes
205
+ # Sanitises an Array to make it uniform. Removes empty Strings and nils,
206
+ # processes non empty Strings using Wgit::Utils.sanitize and removes
187
207
  # duplicates.
188
208
  #
189
- # @param arr [Enumerable] The Array to process. arr is modified.
190
- # @return [Enumerable] The processed arr is both modified and then returned.
191
- def self.process_arr(arr, encode: true)
209
+ # @param arr [Enumerable] The Array to sanitize. arr is modified.
210
+ # @return [Enumerable] The sanitized arr is both modified and then returned.
211
+ def self.sanitize_arr(arr, encode: true)
192
212
  if arr.is_a?(Array)
193
- arr.map! { |str| process_str(str, encode: encode) }
213
+ arr.map! { |str| sanitize(str, encode: encode) }
194
214
  arr.reject! { |str| str.is_a?(String) ? str.empty? : false }
195
215
  arr.compact!
196
216
  arr.uniq!
@@ -198,13 +218,5 @@ module Wgit
198
218
 
199
219
  arr
200
220
  end
201
-
202
- # Returns the model having removed non bson types (for use with MongoDB).
203
- #
204
- # @param model_hash [Hash] The model Hash to process.
205
- # @return [Hash] The model Hash with non bson types removed.
206
- def self.remove_non_bson_types(model_hash)
207
- model_hash.select { |_k, v| v.respond_to?(:bson_type) }
208
- end
209
221
  end
210
222
  end
data/lib/wgit/version.rb CHANGED
@@ -2,10 +2,11 @@
2
2
 
3
3
  # Wgit is a WWW indexer/scraper which crawls URL's and retrieves their page
4
4
  # contents for later use.
5
+ #
5
6
  # @author Michael Telford
6
7
  module Wgit
7
8
  # The current gem version of Wgit.
8
- VERSION = '0.7.0'
9
+ VERSION = '0.10.1'
9
10
 
10
11
  # Returns the current gem version of Wgit as a String.
11
12
  def self.version
data/lib/wgit.rb CHANGED
@@ -6,9 +6,11 @@ require_relative 'wgit/assertable'
6
6
  require_relative 'wgit/utils'
7
7
  require_relative 'wgit/url'
8
8
  require_relative 'wgit/document'
9
- require_relative 'wgit/document_extensions'
9
+ require_relative 'wgit/document_extractors'
10
10
  require_relative 'wgit/crawler'
11
11
  require_relative 'wgit/database/model'
12
12
  require_relative 'wgit/database/database'
13
13
  require_relative 'wgit/indexer'
14
+ require_relative 'wgit/dsl'
15
+ require_relative 'wgit/base'
14
16
  # require_relative 'wgit/core_ext' - Must be explicitly required.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wgit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.0
4
+ version: 0.10.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-01-04 00:00:00.000000000 Z
11
+ date: 2021-11-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '1.3'
69
+ - !ruby/object:Gem::Dependency
70
+ name: ferrum
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '0.8'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '0.8'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: byebug
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -184,14 +198,10 @@ dependencies:
184
198
  - - "<"
185
199
  - !ruby/object:Gem::Version
186
200
  version: '1.0'
187
- description: 'Fundamentally, Wgit is a HTTP indexer/scraper which crawls URL''s to
188
- retrieve and serialise their page contents for later use. You can use Wgit to scrape
189
- entire websites if required. Wgit also provides a means to search indexed documents
190
- stored in a database. Therefore, this library provides the main components of a
191
- WWW search engine. The Wgit API is easily extended allowing you to pull out the
192
- parts of a webpage that are important to you, the code snippets or tables for example.
193
- As Wgit is a library, it supports many different use cases including data mining,
194
- analytics, web indexing and URL parsing to name a few.
201
+ description: 'Wgit was primarily designed to crawl static HTML websites to index and
202
+ search their content - providing the basis of any search engine; but Wgit is suitable
203
+ for many application domains including: URL parsing, data mining and statistical
204
+ analysis.
195
205
 
196
206
  '
197
207
  email: michael.telford@live.com
@@ -202,12 +212,14 @@ extra_rdoc_files: []
202
212
  files:
203
213
  - "./lib/wgit.rb"
204
214
  - "./lib/wgit/assertable.rb"
215
+ - "./lib/wgit/base.rb"
205
216
  - "./lib/wgit/core_ext.rb"
206
217
  - "./lib/wgit/crawler.rb"
207
218
  - "./lib/wgit/database/database.rb"
208
219
  - "./lib/wgit/database/model.rb"
209
220
  - "./lib/wgit/document.rb"
210
- - "./lib/wgit/document_extensions.rb"
221
+ - "./lib/wgit/document_extractors.rb"
222
+ - "./lib/wgit/dsl.rb"
211
223
  - "./lib/wgit/indexer.rb"
212
224
  - "./lib/wgit/logger.rb"
213
225
  - "./lib/wgit/response.rb"
@@ -229,7 +241,7 @@ metadata:
229
241
  source_code_uri: https://github.com/michaeltelford/wgit
230
242
  changelog_uri: https://github.com/michaeltelford/wgit/blob/master/CHANGELOG.md
231
243
  bug_tracker_uri: https://github.com/michaeltelford/wgit/issues
232
- documentation_uri: https://www.rubydoc.info/github/michaeltelford/wgit/master
244
+ documentation_uri: https://www.rubydoc.info/gems/wgit
233
245
  allowed_push_host: https://rubygems.org
234
246
  post_install_message: Added the 'wgit' executable to $PATH
235
247
  rdoc_options: []
@@ -237,18 +249,21 @@ require_paths:
237
249
  - lib
238
250
  required_ruby_version: !ruby/object:Gem::Requirement
239
251
  requirements:
240
- - - "~>"
252
+ - - ">="
253
+ - !ruby/object:Gem::Version
254
+ version: '2.6'
255
+ - - "<"
241
256
  - !ruby/object:Gem::Version
242
- version: '2.5'
257
+ version: '4'
243
258
  required_rubygems_version: !ruby/object:Gem::Requirement
244
259
  requirements:
245
260
  - - ">="
246
261
  - !ruby/object:Gem::Version
247
262
  version: '0'
248
263
  requirements: []
249
- rubygems_version: 3.0.6
250
- signing_key:
264
+ rubygems_version: 3.2.22
265
+ signing_key:
251
266
  specification_version: 4
252
- summary: Wgit is a Ruby library primarily used for crawling, indexing and searching
253
- HTML webpages.
267
+ summary: Wgit is a HTML web crawler, written in Ruby, that allows you to programmatically
268
+ extract the data you want from the web.
254
269
  test_files: []