wgit 0.8.0 → 0.10.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +1 -1
- data/CHANGELOG.md +68 -2
- data/LICENSE.txt +1 -1
- data/README.md +114 -326
- data/bin/wgit +9 -5
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +39 -0
- data/lib/wgit/crawler.rb +206 -76
- data/lib/wgit/database/database.rb +309 -134
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +145 -95
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +11 -11
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +66 -163
- data/lib/wgit/response.rb +5 -2
- data/lib/wgit/url.rb +177 -63
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +2 -1
- data/lib/wgit.rb +3 -1
- metadata +34 -19
data/lib/wgit/url.rb
CHANGED
@@ -6,15 +6,15 @@ require 'uri'
|
|
6
6
|
require 'addressable/uri'
|
7
7
|
|
8
8
|
module Wgit
|
9
|
-
# Class modeling a web based HTTP URL.
|
9
|
+
# Class modeling/serialising a web based HTTP URL.
|
10
10
|
#
|
11
11
|
# Can be an internal/relative link e.g. "about.html" or an absolute URL
|
12
|
-
# e.g. "http://www.google.co.uk". Is a subclass of String and uses
|
13
|
-
#
|
12
|
+
# e.g. "http://www.google.co.uk". Is a subclass of String and uses `URI` and
|
13
|
+
# `addressable/uri` internally for parsing.
|
14
14
|
#
|
15
|
-
# Most of the methods in this class return new Wgit::Url instances making
|
16
|
-
# method calls chainable e.g. url.omit_base.omit_fragment etc. The
|
17
|
-
# also try to be idempotent where possible.
|
15
|
+
# Most of the methods in this class return new `Wgit::Url` instances making
|
16
|
+
# the method calls chainable e.g. `url.omit_base.omit_fragment` etc. The
|
17
|
+
# methods also try to be idempotent where possible.
|
18
18
|
class Url < String
|
19
19
|
include Assertable
|
20
20
|
|
@@ -28,7 +28,7 @@ module Wgit
|
|
28
28
|
# The duration of the crawl for this Url (in seconds).
|
29
29
|
attr_accessor :crawl_duration
|
30
30
|
|
31
|
-
# Initializes a new instance of Wgit::Url which
|
31
|
+
# Initializes a new instance of Wgit::Url which models a web based
|
32
32
|
# HTTP URL.
|
33
33
|
#
|
34
34
|
# @param url_or_obj [String, Wgit::Url, #fetch#[]] Is either a String
|
@@ -99,10 +99,10 @@ module Wgit
|
|
99
99
|
# @param obj [Object] The object to parse, which #is_a?(String).
|
100
100
|
# @raise [StandardError] If obj.is_a?(String) is false.
|
101
101
|
# @return [Wgit::Url, nil] A Wgit::Url instance or nil (if obj is invalid).
|
102
|
-
def self.
|
102
|
+
def self.parse?(obj)
|
103
103
|
parse(obj)
|
104
104
|
rescue Addressable::URI::InvalidURIError
|
105
|
-
Wgit.logger.debug("Wgit::Url.
|
105
|
+
Wgit.logger.debug("Wgit::Url.parse?('#{obj}') exception: \
|
106
106
|
Addressable::URI::InvalidURIError")
|
107
107
|
nil
|
108
108
|
end
|
@@ -115,8 +115,6 @@ Addressable::URI::InvalidURIError")
|
|
115
115
|
def crawled=(bool)
|
116
116
|
@crawled = bool
|
117
117
|
@date_crawled = bool ? Wgit::Utils.time_stamp : nil
|
118
|
-
|
119
|
-
bool
|
120
118
|
end
|
121
119
|
|
122
120
|
# Overrides String#replace setting the new_url @uri and String value.
|
@@ -146,10 +144,10 @@ Addressable::URI::InvalidURIError")
|
|
146
144
|
# @param opts [Hash] The options with which to check relativity. Only one
|
147
145
|
# opts param should be provided. The provided opts param Url must be
|
148
146
|
# absolute and be prefixed with a scheme. Consider using the output of
|
149
|
-
# Wgit::Url#
|
150
|
-
# @option opts [Wgit::Url, String] :
|
151
|
-
# http://www.google.com/how which gives a
|
152
|
-
# 'http://www.google.com'.
|
147
|
+
# Wgit::Url#to_origin which should work (unless it's nil).
|
148
|
+
# @option opts [Wgit::Url, String] :origin The Url origin e.g.
|
149
|
+
# http://www.google.com:81/how which gives an origin of
|
150
|
+
# 'http://www.google.com:81'.
|
153
151
|
# @option opts [Wgit::Url, String] :host The Url host e.g.
|
154
152
|
# http://www.google.com/how which gives a host of 'www.google.com'.
|
155
153
|
# @option opts [Wgit::Url, String] :domain The Url domain e.g.
|
@@ -160,10 +158,11 @@ Addressable::URI::InvalidURIError")
|
|
160
158
|
# param has been provided.
|
161
159
|
# @return [Boolean] True if relative, false if absolute.
|
162
160
|
def relative?(opts = {})
|
163
|
-
defaults = {
|
161
|
+
defaults = { origin: nil, host: nil, domain: nil, brand: nil }
|
164
162
|
opts = defaults.merge(opts)
|
165
163
|
raise 'Url (self) cannot be empty' if empty?
|
166
164
|
|
165
|
+
return false if scheme_relative?
|
167
166
|
return true if @uri.relative?
|
168
167
|
|
169
168
|
# Self is absolute but may be relative to the opts param e.g. host.
|
@@ -180,8 +179,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
180
179
|
end
|
181
180
|
|
182
181
|
case type
|
183
|
-
when :
|
184
|
-
|
182
|
+
when :origin # http://www.google.com:81
|
183
|
+
to_origin == url.to_origin
|
185
184
|
when :host # www.google.com
|
186
185
|
to_host == url.to_host
|
187
186
|
when :domain # google.com
|
@@ -206,8 +205,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
206
205
|
# @return [Boolean] True if valid, absolute and crawlable, otherwise false.
|
207
206
|
def valid?
|
208
207
|
return false if relative?
|
209
|
-
return false unless
|
210
|
-
return false
|
208
|
+
return false unless to_origin && to_domain
|
209
|
+
return false unless URI::DEFAULT_PARSER.make_regexp.match(normalize)
|
211
210
|
|
212
211
|
true
|
213
212
|
end
|
@@ -238,7 +237,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
238
237
|
Wgit::Url.new(concatted)
|
239
238
|
end
|
240
239
|
|
241
|
-
#
|
240
|
+
# Normalizes/escapes self and returns a new Wgit::Url. Self isn't modified.
|
241
|
+
# This should be used before GET'ing the url, in case it has IRI chars.
|
242
242
|
#
|
243
243
|
# @return [Wgit::Url] An escaped version of self.
|
244
244
|
def normalize
|
@@ -249,8 +249,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
249
249
|
# modify the receiver.
|
250
250
|
#
|
251
251
|
# If self is absolute then it's returned as is, making this method
|
252
|
-
# idempotent. The doc's
|
253
|
-
# doc.url is used as the base; which is concatted with self.
|
252
|
+
# idempotent. The doc's `<base>` element is used if present, otherwise
|
253
|
+
# `doc.url` is used as the base; which is concatted with self.
|
254
254
|
#
|
255
255
|
# Typically used to build an absolute link obtained from a document.
|
256
256
|
#
|
@@ -258,35 +258,37 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
258
258
|
# link = Wgit::Url.new('/favicon.png')
|
259
259
|
# doc = Wgit::Document.new('http://example.com')
|
260
260
|
#
|
261
|
-
# link.
|
261
|
+
# link.make_absolute(doc) # => "http://example.com/favicon.png"
|
262
262
|
#
|
263
263
|
# @param doc [Wgit::Document] The doc whose base Url is concatted with
|
264
264
|
# self.
|
265
265
|
# @raise [StandardError] If doc isn't a Wgit::Document or if `doc.base_url`
|
266
266
|
# raises an Exception.
|
267
267
|
# @return [Wgit::Url] Self in absolute form.
|
268
|
-
def
|
268
|
+
def make_absolute(doc)
|
269
269
|
assert_type(doc, Wgit::Document)
|
270
|
+
raise 'Cannot make absolute when Document @url is not valid' \
|
271
|
+
unless doc.url.valid?
|
272
|
+
|
273
|
+
return prefix_scheme(doc.url.to_scheme&.to_sym) if scheme_relative?
|
270
274
|
|
271
275
|
absolute? ? self : doc.base_url(link: self).concat(self)
|
272
276
|
end
|
273
277
|
|
274
|
-
# Returns self having prefixed a protocol
|
278
|
+
# Returns self having prefixed a scheme/protocol. Doesn't modify receiver.
|
275
279
|
# Returns self even if absolute (with scheme); therefore is idempotent.
|
276
280
|
#
|
277
|
-
# @param
|
278
|
-
# @return [Wgit::Url] Self with a
|
279
|
-
def prefix_scheme(
|
280
|
-
|
281
|
-
|
282
|
-
case protocol
|
283
|
-
when :http
|
284
|
-
Wgit::Url.new("http://#{url}")
|
285
|
-
when :https
|
286
|
-
Wgit::Url.new("https://#{url}")
|
287
|
-
else
|
288
|
-
raise "protocol must be :http or :https, not :#{protocol}"
|
281
|
+
# @param scheme [Symbol] Either :http or :https.
|
282
|
+
# @return [Wgit::Url] Self with a scheme prefix.
|
283
|
+
def prefix_scheme(scheme = :http)
|
284
|
+
unless %i[http https].include?(scheme)
|
285
|
+
raise "scheme must be :http or :https, not :#{scheme}"
|
289
286
|
end
|
287
|
+
|
288
|
+
return self if absolute? && !scheme_relative?
|
289
|
+
|
290
|
+
separator = scheme_relative? ? '' : '//'
|
291
|
+
Wgit::Url.new("#{scheme}:#{separator}#{self}")
|
290
292
|
end
|
291
293
|
|
292
294
|
# Returns a Hash containing this Url's instance vars excluding @uri.
|
@@ -294,8 +296,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
294
296
|
#
|
295
297
|
# @return [Hash] self's instance vars as a Hash.
|
296
298
|
def to_h
|
297
|
-
|
298
|
-
h = Wgit::Utils.to_h(self, ignore: ignore)
|
299
|
+
h = Wgit::Utils.to_h(self, ignore: ['@uri'])
|
299
300
|
Hash[h.to_a.insert(0, ['url', self])] # Insert url at position 0.
|
300
301
|
end
|
301
302
|
|
@@ -338,6 +339,20 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
338
339
|
host ? Wgit::Url.new(host) : nil
|
339
340
|
end
|
340
341
|
|
342
|
+
# Returns a new Wgit::Url containing just the port of this URL e.g.
|
343
|
+
# Given http://www.google.co.uk:443/about.html, '443' is returned.
|
344
|
+
#
|
345
|
+
# @return [Wgit::Url, nil] Containing just the port or nil.
|
346
|
+
def to_port
|
347
|
+
port = @uri.port
|
348
|
+
|
349
|
+
# @uri.port defaults port to 80/443 if missing, so we check for :#{port}.
|
350
|
+
return nil unless port
|
351
|
+
return nil unless include?(":#{port}")
|
352
|
+
|
353
|
+
Wgit::Url.new(port.to_s)
|
354
|
+
end
|
355
|
+
|
341
356
|
# Returns a new Wgit::Url containing just the domain of this URL e.g.
|
342
357
|
# Given http://www.google.co.uk/about.html, google.co.uk is returned.
|
343
358
|
#
|
@@ -347,6 +362,20 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
347
362
|
domain ? Wgit::Url.new(domain) : nil
|
348
363
|
end
|
349
364
|
|
365
|
+
# Returns a new Wgit::Url containing just the sub domain of this URL e.g.
|
366
|
+
# Given http://scripts.dev.google.com, scripts.dev is returned.
|
367
|
+
#
|
368
|
+
# @return [Wgit::Url, nil] Containing just the sub domain or nil.
|
369
|
+
def to_sub_domain
|
370
|
+
return nil unless to_host
|
371
|
+
|
372
|
+
dot_domain = ".#{to_domain}"
|
373
|
+
return nil unless include?(dot_domain)
|
374
|
+
|
375
|
+
sub_domain = to_host.sub(dot_domain, '')
|
376
|
+
Wgit::Url.new(sub_domain)
|
377
|
+
end
|
378
|
+
|
350
379
|
# Returns a new Wgit::Url containing just the brand of this URL e.g.
|
351
380
|
# Given http://www.google.co.uk/about.html, google is returned.
|
352
381
|
#
|
@@ -362,12 +391,24 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
362
391
|
# @return [Wgit::Url, nil] The base of self e.g. http://www.google.co.uk or
|
363
392
|
# nil.
|
364
393
|
def to_base
|
365
|
-
return nil
|
394
|
+
return nil unless @uri.scheme && @uri.host
|
366
395
|
|
367
396
|
base = "#{@uri.scheme}://#{@uri.host}"
|
368
397
|
Wgit::Url.new(base)
|
369
398
|
end
|
370
399
|
|
400
|
+
# Returns only the origin of this URL e.g. the protocol scheme, host and
|
401
|
+
# port combined. For http://localhost:3000/api, http://localhost:3000 gets
|
402
|
+
# returned. If there's no port present, then to_base is returned.
|
403
|
+
#
|
404
|
+
# @return [Wgit::Url, nil] The origin of self or nil.
|
405
|
+
def to_origin
|
406
|
+
return nil unless to_base
|
407
|
+
return to_base unless to_port
|
408
|
+
|
409
|
+
Wgit::Url.new("#{to_base}:#{to_port}")
|
410
|
+
end
|
411
|
+
|
371
412
|
# Returns the path of this URL e.g. the bit after the host without slashes.
|
372
413
|
# For example:
|
373
414
|
# Wgit::Url.new("http://www.google.co.uk/about.html/").to_path returns
|
@@ -396,7 +437,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
396
437
|
end
|
397
438
|
|
398
439
|
# Returns a new Wgit::Url containing just the query string of this URL
|
399
|
-
# e.g. Given http://google.com?q=
|
440
|
+
# e.g. Given http://google.com?q=foo&bar=1, 'q=foo&bar=1' is returned.
|
400
441
|
#
|
401
442
|
# @return [Wgit::Url, nil] Containing just the query string or nil.
|
402
443
|
def to_query
|
@@ -404,6 +445,24 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
404
445
|
query ? Wgit::Url.new(query) : nil
|
405
446
|
end
|
406
447
|
|
448
|
+
# Returns a Hash containing just the query string parameters of this URL
|
449
|
+
# e.g. Given http://google.com?q=ruby, "{ 'q' => 'ruby' }" is returned.
|
450
|
+
#
|
451
|
+
# @param symbolize_keys [Boolean] The returned Hash keys will be Symbols if
|
452
|
+
# true, Strings otherwise.
|
453
|
+
# @return [Hash<String | Symbol, String>] Containing the query string
|
454
|
+
# params or empty if the URL doesn't contain any query parameters.
|
455
|
+
def to_query_hash(symbolize_keys: false)
|
456
|
+
query_str = to_query
|
457
|
+
return {} unless query_str
|
458
|
+
|
459
|
+
query_str.split('&').each_with_object({}) do |param, hash|
|
460
|
+
k, v = param.split('=')
|
461
|
+
k = k.to_sym if symbolize_keys
|
462
|
+
hash[k] = v
|
463
|
+
end
|
464
|
+
end
|
465
|
+
|
407
466
|
# Returns a new Wgit::Url containing just the fragment string of this URL
|
408
467
|
# e.g. Given http://google.com#about, #about is returned.
|
409
468
|
#
|
@@ -425,6 +484,24 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
425
484
|
segs.length > 1 ? Wgit::Url.new(segs.last) : nil
|
426
485
|
end
|
427
486
|
|
487
|
+
# Returns a new Wgit::Url containing just the username string of this URL
|
488
|
+
# e.g. Given http://me:pass1@example.com, me is returned.
|
489
|
+
#
|
490
|
+
# @return [Wgit::Url, nil] Containing just the user string or nil.
|
491
|
+
def to_user
|
492
|
+
user = @uri.user
|
493
|
+
user ? Wgit::Url.new(user) : nil
|
494
|
+
end
|
495
|
+
|
496
|
+
# Returns a new Wgit::Url containing just the password string of this URL
|
497
|
+
# e.g. Given http://me:pass1@example.com, pass1 is returned.
|
498
|
+
#
|
499
|
+
# @return [Wgit::Url, nil] Containing just the password string or nil.
|
500
|
+
def to_password
|
501
|
+
password = @uri.password
|
502
|
+
password ? Wgit::Url.new(password) : nil
|
503
|
+
end
|
504
|
+
|
428
505
|
# Omits the given URL components from self and returns a new Wgit::Url.
|
429
506
|
#
|
430
507
|
# Calls Addressable::URI#omit underneath and creates a new Wgit::Url from
|
@@ -468,7 +545,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
468
545
|
.omit_trailing_slash
|
469
546
|
end
|
470
547
|
|
471
|
-
# Returns a new Wgit::Url with the base (
|
548
|
+
# Returns a new Wgit::Url with the base (scheme and host) removed e.g. Given
|
472
549
|
# http://google.com/search?q=something#about, search?q=something#about is
|
473
550
|
# returned. If relative and base isn't present then self is returned.
|
474
551
|
# Leading and trailing slashes are always stripped from the return value.
|
@@ -483,6 +560,21 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
483
560
|
Wgit::Url.new(omit_base).omit_slashes
|
484
561
|
end
|
485
562
|
|
563
|
+
# Returns a new Wgit::Url with the origin (base + port) removed e.g. Given
|
564
|
+
# http://google.com:81/search?q=something#about, search?q=something#about is
|
565
|
+
# returned. If relative and base isn't present then self is returned.
|
566
|
+
# Leading and trailing slashes are always stripped from the return value.
|
567
|
+
#
|
568
|
+
# @return [Wgit::Url] Self containing everything after the origin.
|
569
|
+
def omit_origin
|
570
|
+
origin = to_origin
|
571
|
+
omit_origin = origin ? gsub(origin, '') : self
|
572
|
+
|
573
|
+
return self if ['', '/'].include?(omit_origin)
|
574
|
+
|
575
|
+
Wgit::Url.new(omit_origin).omit_slashes
|
576
|
+
end
|
577
|
+
|
486
578
|
# Returns a new Wgit::Url with the query string portion removed e.g. Given
|
487
579
|
# http://google.com/search?q=hello, http://google.com/search is
|
488
580
|
# returned. Self is returned as is if no query string is present. A URL
|
@@ -528,25 +620,47 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
528
620
|
start_with?('#')
|
529
621
|
end
|
530
622
|
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
alias
|
547
|
-
alias
|
548
|
-
alias
|
549
|
-
alias
|
550
|
-
alias
|
623
|
+
# Returns true if self equals '/' a.k.a. index.
|
624
|
+
#
|
625
|
+
# @return [Boolean] True if self equals '/', false otherwise.
|
626
|
+
def index?
|
627
|
+
self == '/'
|
628
|
+
end
|
629
|
+
|
630
|
+
# Returns true if self starts with '//' a.k.a. a scheme/protocol relative
|
631
|
+
# path.
|
632
|
+
#
|
633
|
+
# @return [Boolean] True if self starts with '//', false otherwise.
|
634
|
+
def scheme_relative?
|
635
|
+
start_with?('//')
|
636
|
+
end
|
637
|
+
|
638
|
+
alias + concat
|
639
|
+
alias crawled? crawled
|
640
|
+
alias is_relative? relative?
|
641
|
+
alias is_absolute? absolute?
|
642
|
+
alias is_valid? valid?
|
643
|
+
alias is_query? query?
|
644
|
+
alias is_fragment? fragment?
|
645
|
+
alias is_index? index?
|
646
|
+
alias is_scheme_relative? scheme_relative?
|
647
|
+
alias uri to_uri
|
648
|
+
alias url to_url
|
649
|
+
alias scheme to_scheme
|
650
|
+
alias host to_host
|
651
|
+
alias port to_port
|
652
|
+
alias domain to_domain
|
653
|
+
alias brand to_brand
|
654
|
+
alias base to_base
|
655
|
+
alias origin to_origin
|
656
|
+
alias path to_path
|
657
|
+
alias endpoint to_endpoint
|
658
|
+
alias query to_query
|
659
|
+
alias query_hash to_query_hash
|
660
|
+
alias fragment to_fragment
|
661
|
+
alias extension to_extension
|
662
|
+
alias user to_user
|
663
|
+
alias password to_password
|
664
|
+
alias sub_domain to_sub_domain
|
551
665
|
end
|
552
666
|
end
|
data/lib/wgit/utils.rb
CHANGED
@@ -145,7 +145,8 @@ module Wgit
|
|
145
145
|
# @param keyword_limit [Integer] The max amount of keywords to be
|
146
146
|
# outputted to the stream.
|
147
147
|
# @param stream [#puts] Any object that respond_to?(:puts). It is used
|
148
|
-
# to output text somewhere e.g. a file or
|
148
|
+
# to output text somewhere e.g. a file or STDERR.
|
149
|
+
# @return [Integer] The number of results.
|
149
150
|
def self.printf_search_results(results, keyword_limit: 5, stream: STDOUT)
|
150
151
|
raise 'stream must respond_to? :puts' unless stream.respond_to?(:puts)
|
151
152
|
|
@@ -162,18 +163,37 @@ module Wgit
|
|
162
163
|
stream.puts
|
163
164
|
end
|
164
165
|
|
165
|
-
|
166
|
+
results.size
|
166
167
|
end
|
167
168
|
|
168
|
-
#
|
169
|
+
# Sanitises the obj to make it uniform by calling the correct sanitize_*
|
170
|
+
# method for its type e.g. if obj.is_a? String then sanitize_str(obj). Any type
|
171
|
+
# not in the case statement will be ignored and returned as is.
|
172
|
+
#
|
173
|
+
# @param obj [Object] The object to be sanitized.
|
174
|
+
# @param encode [Boolean] Whether or not to encode to UTF-8 replacing
|
175
|
+
# invalid characters.
|
176
|
+
# @return [Object] The sanitized obj is both modified and then returned.
|
177
|
+
def self.sanitize(obj, encode: true)
|
178
|
+
case obj
|
179
|
+
when String
|
180
|
+
sanitize_str(obj, encode: encode)
|
181
|
+
when Array
|
182
|
+
sanitize_arr(obj, encode: encode)
|
183
|
+
else
|
184
|
+
obj
|
185
|
+
end
|
186
|
+
end
|
187
|
+
|
188
|
+
# Sanitises a String to make it uniform. Strips any leading/trailing white
|
169
189
|
# space. Also applies UTF-8 encoding (replacing invalid characters) if
|
170
190
|
# `encode: true`.
|
171
191
|
#
|
172
|
-
# @param str [String] The String to
|
192
|
+
# @param str [String] The String to sanitize. str is modified.
|
173
193
|
# @param encode [Boolean] Whether or not to encode to UTF-8 replacing
|
174
194
|
# invalid characters.
|
175
|
-
# @return [String] The
|
176
|
-
def self.
|
195
|
+
# @return [String] The sanitized str is both modified and then returned.
|
196
|
+
def self.sanitize_str(str, encode: true)
|
177
197
|
if str.is_a?(String)
|
178
198
|
str.encode!('UTF-8', undef: :replace, invalid: :replace) if encode
|
179
199
|
str.strip!
|
@@ -182,15 +202,15 @@ module Wgit
|
|
182
202
|
str
|
183
203
|
end
|
184
204
|
|
185
|
-
#
|
186
|
-
# processes non empty Strings using Wgit::Utils.
|
205
|
+
# Sanitises an Array to make it uniform. Removes empty Strings and nils,
|
206
|
+
# processes non empty Strings using Wgit::Utils.sanitize and removes
|
187
207
|
# duplicates.
|
188
208
|
#
|
189
|
-
# @param arr [Enumerable] The Array to
|
190
|
-
# @return [Enumerable] The
|
191
|
-
def self.
|
209
|
+
# @param arr [Enumerable] The Array to sanitize. arr is modified.
|
210
|
+
# @return [Enumerable] The sanitized arr is both modified and then returned.
|
211
|
+
def self.sanitize_arr(arr, encode: true)
|
192
212
|
if arr.is_a?(Array)
|
193
|
-
arr.map! { |str|
|
213
|
+
arr.map! { |str| sanitize(str, encode: encode) }
|
194
214
|
arr.reject! { |str| str.is_a?(String) ? str.empty? : false }
|
195
215
|
arr.compact!
|
196
216
|
arr.uniq!
|
@@ -198,13 +218,5 @@ module Wgit
|
|
198
218
|
|
199
219
|
arr
|
200
220
|
end
|
201
|
-
|
202
|
-
# Returns the model having removed non bson types (for use with MongoDB).
|
203
|
-
#
|
204
|
-
# @param model_hash [Hash] The model Hash to process.
|
205
|
-
# @return [Hash] The model Hash with non bson types removed.
|
206
|
-
def self.remove_non_bson_types(model_hash)
|
207
|
-
model_hash.select { |_k, v| v.respond_to?(:bson_type) }
|
208
|
-
end
|
209
221
|
end
|
210
222
|
end
|
data/lib/wgit/version.rb
CHANGED
@@ -2,10 +2,11 @@
|
|
2
2
|
|
3
3
|
# Wgit is a WWW indexer/scraper which crawls URL's and retrieves their page
|
4
4
|
# contents for later use.
|
5
|
+
#
|
5
6
|
# @author Michael Telford
|
6
7
|
module Wgit
|
7
8
|
# The current gem version of Wgit.
|
8
|
-
VERSION = '0.
|
9
|
+
VERSION = '0.10.2'
|
9
10
|
|
10
11
|
# Returns the current gem version of Wgit as a String.
|
11
12
|
def self.version
|
data/lib/wgit.rb
CHANGED
@@ -6,9 +6,11 @@ require_relative 'wgit/assertable'
|
|
6
6
|
require_relative 'wgit/utils'
|
7
7
|
require_relative 'wgit/url'
|
8
8
|
require_relative 'wgit/document'
|
9
|
-
require_relative 'wgit/
|
9
|
+
require_relative 'wgit/document_extractors'
|
10
10
|
require_relative 'wgit/crawler'
|
11
11
|
require_relative 'wgit/database/model'
|
12
12
|
require_relative 'wgit/database/database'
|
13
13
|
require_relative 'wgit/indexer'
|
14
|
+
require_relative 'wgit/dsl'
|
15
|
+
require_relative 'wgit/base'
|
14
16
|
# require_relative 'wgit/core_ext' - Must be explicitly required.
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wgit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.10.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-11-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '1.3'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: ferrum
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0.8'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0.8'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: byebug
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -184,14 +198,10 @@ dependencies:
|
|
184
198
|
- - "<"
|
185
199
|
- !ruby/object:Gem::Version
|
186
200
|
version: '1.0'
|
187
|
-
description: '
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
WWW search engine. The Wgit API is easily extended allowing you to pull out the
|
192
|
-
parts of a webpage that are important to you, the code snippets or tables for example.
|
193
|
-
As Wgit is a library, it supports many different use cases including data mining,
|
194
|
-
analytics, web indexing and URL parsing to name a few.
|
201
|
+
description: 'Wgit was primarily designed to crawl static HTML websites to index and
|
202
|
+
search their content - providing the basis of any search engine; but Wgit is suitable
|
203
|
+
for many application domains including: URL parsing, data mining and statistical
|
204
|
+
analysis.
|
195
205
|
|
196
206
|
'
|
197
207
|
email: michael.telford@live.com
|
@@ -202,12 +212,14 @@ extra_rdoc_files: []
|
|
202
212
|
files:
|
203
213
|
- "./lib/wgit.rb"
|
204
214
|
- "./lib/wgit/assertable.rb"
|
215
|
+
- "./lib/wgit/base.rb"
|
205
216
|
- "./lib/wgit/core_ext.rb"
|
206
217
|
- "./lib/wgit/crawler.rb"
|
207
218
|
- "./lib/wgit/database/database.rb"
|
208
219
|
- "./lib/wgit/database/model.rb"
|
209
220
|
- "./lib/wgit/document.rb"
|
210
|
-
- "./lib/wgit/
|
221
|
+
- "./lib/wgit/document_extractors.rb"
|
222
|
+
- "./lib/wgit/dsl.rb"
|
211
223
|
- "./lib/wgit/indexer.rb"
|
212
224
|
- "./lib/wgit/logger.rb"
|
213
225
|
- "./lib/wgit/response.rb"
|
@@ -229,7 +241,7 @@ metadata:
|
|
229
241
|
source_code_uri: https://github.com/michaeltelford/wgit
|
230
242
|
changelog_uri: https://github.com/michaeltelford/wgit/blob/master/CHANGELOG.md
|
231
243
|
bug_tracker_uri: https://github.com/michaeltelford/wgit/issues
|
232
|
-
documentation_uri: https://www.rubydoc.info/
|
244
|
+
documentation_uri: https://www.rubydoc.info/gems/wgit
|
233
245
|
allowed_push_host: https://rubygems.org
|
234
246
|
post_install_message: Added the 'wgit' executable to $PATH
|
235
247
|
rdoc_options: []
|
@@ -237,18 +249,21 @@ require_paths:
|
|
237
249
|
- lib
|
238
250
|
required_ruby_version: !ruby/object:Gem::Requirement
|
239
251
|
requirements:
|
240
|
-
- - "
|
252
|
+
- - ">="
|
253
|
+
- !ruby/object:Gem::Version
|
254
|
+
version: '2.6'
|
255
|
+
- - "<"
|
241
256
|
- !ruby/object:Gem::Version
|
242
|
-
version: '
|
257
|
+
version: '4'
|
243
258
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
244
259
|
requirements:
|
245
260
|
- - ">="
|
246
261
|
- !ruby/object:Gem::Version
|
247
262
|
version: '0'
|
248
263
|
requirements: []
|
249
|
-
rubygems_version: 3.
|
250
|
-
signing_key:
|
264
|
+
rubygems_version: 3.2.22
|
265
|
+
signing_key:
|
251
266
|
specification_version: 4
|
252
|
-
summary: Wgit is a
|
253
|
-
|
267
|
+
summary: Wgit is a HTML web crawler, written in Ruby, that allows you to programmatically
|
268
|
+
extract the data you want from the web.
|
254
269
|
test_files: []
|