wgit 0.8.0 → 0.10.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +1 -1
- data/CHANGELOG.md +68 -2
- data/LICENSE.txt +1 -1
- data/README.md +114 -326
- data/bin/wgit +9 -5
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +39 -0
- data/lib/wgit/crawler.rb +206 -76
- data/lib/wgit/database/database.rb +309 -134
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +145 -95
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +11 -11
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +66 -163
- data/lib/wgit/response.rb +5 -2
- data/lib/wgit/url.rb +177 -63
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +2 -1
- data/lib/wgit.rb +3 -1
- metadata +34 -19
data/lib/wgit/url.rb
CHANGED
@@ -6,15 +6,15 @@ require 'uri'
|
|
6
6
|
require 'addressable/uri'
|
7
7
|
|
8
8
|
module Wgit
|
9
|
-
# Class modeling a web based HTTP URL.
|
9
|
+
# Class modeling/serialising a web based HTTP URL.
|
10
10
|
#
|
11
11
|
# Can be an internal/relative link e.g. "about.html" or an absolute URL
|
12
|
-
# e.g. "http://www.google.co.uk". Is a subclass of String and uses
|
13
|
-
#
|
12
|
+
# e.g. "http://www.google.co.uk". Is a subclass of String and uses `URI` and
|
13
|
+
# `addressable/uri` internally for parsing.
|
14
14
|
#
|
15
|
-
# Most of the methods in this class return new Wgit::Url instances making
|
16
|
-
# method calls chainable e.g. url.omit_base.omit_fragment etc. The
|
17
|
-
# also try to be idempotent where possible.
|
15
|
+
# Most of the methods in this class return new `Wgit::Url` instances making
|
16
|
+
# the method calls chainable e.g. `url.omit_base.omit_fragment` etc. The
|
17
|
+
# methods also try to be idempotent where possible.
|
18
18
|
class Url < String
|
19
19
|
include Assertable
|
20
20
|
|
@@ -28,7 +28,7 @@ module Wgit
|
|
28
28
|
# The duration of the crawl for this Url (in seconds).
|
29
29
|
attr_accessor :crawl_duration
|
30
30
|
|
31
|
-
# Initializes a new instance of Wgit::Url which
|
31
|
+
# Initializes a new instance of Wgit::Url which models a web based
|
32
32
|
# HTTP URL.
|
33
33
|
#
|
34
34
|
# @param url_or_obj [String, Wgit::Url, #fetch#[]] Is either a String
|
@@ -99,10 +99,10 @@ module Wgit
|
|
99
99
|
# @param obj [Object] The object to parse, which #is_a?(String).
|
100
100
|
# @raise [StandardError] If obj.is_a?(String) is false.
|
101
101
|
# @return [Wgit::Url] A Wgit::Url instance or nil (if obj is invalid).
|
102
|
-
def self.
|
102
|
+
def self.parse?(obj)
|
103
103
|
parse(obj)
|
104
104
|
rescue Addressable::URI::InvalidURIError
|
105
|
-
Wgit.logger.debug("Wgit::Url.
|
105
|
+
Wgit.logger.debug("Wgit::Url.parse?('#{obj}') exception: \
|
106
106
|
Addressable::URI::InvalidURIError")
|
107
107
|
nil
|
108
108
|
end
|
@@ -115,8 +115,6 @@ Addressable::URI::InvalidURIError")
|
|
115
115
|
def crawled=(bool)
|
116
116
|
@crawled = bool
|
117
117
|
@date_crawled = bool ? Wgit::Utils.time_stamp : nil
|
118
|
-
|
119
|
-
bool
|
120
118
|
end
|
121
119
|
|
122
120
|
# Overrides String#replace setting the new_url @uri and String value.
|
@@ -146,10 +144,10 @@ Addressable::URI::InvalidURIError")
|
|
146
144
|
# @param opts [Hash] The options with which to check relativity. Only one
|
147
145
|
# opts param should be provided. The provided opts param Url must be
|
148
146
|
# absolute and be prefixed with a scheme. Consider using the output of
|
149
|
-
# Wgit::Url#
|
150
|
-
# @option opts [Wgit::Url, String] :
|
151
|
-
# http://www.google.com/how which gives a
|
152
|
-
# 'http://www.google.com'.
|
147
|
+
# Wgit::Url#to_origin which should work (unless it's nil).
|
148
|
+
# @option opts [Wgit::Url, String] :origin The Url origin e.g.
|
149
|
+
# http://www.google.com:81/how which gives an origin of
|
150
|
+
# 'http://www.google.com:81'.
|
153
151
|
# @option opts [Wgit::Url, String] :host The Url host e.g.
|
154
152
|
# http://www.google.com/how which gives a host of 'www.google.com'.
|
155
153
|
# @option opts [Wgit::Url, String] :domain The Url domain e.g.
|
@@ -160,10 +158,11 @@ Addressable::URI::InvalidURIError")
|
|
160
158
|
# param has been provided.
|
161
159
|
# @return [Boolean] True if relative, false if absolute.
|
162
160
|
def relative?(opts = {})
|
163
|
-
defaults = {
|
161
|
+
defaults = { origin: nil, host: nil, domain: nil, brand: nil }
|
164
162
|
opts = defaults.merge(opts)
|
165
163
|
raise 'Url (self) cannot be empty' if empty?
|
166
164
|
|
165
|
+
return false if scheme_relative?
|
167
166
|
return true if @uri.relative?
|
168
167
|
|
169
168
|
# Self is absolute but may be relative to the opts param e.g. host.
|
@@ -180,8 +179,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
180
179
|
end
|
181
180
|
|
182
181
|
case type
|
183
|
-
when :
|
184
|
-
|
182
|
+
when :origin # http://www.google.com:81
|
183
|
+
to_origin == url.to_origin
|
185
184
|
when :host # www.google.com
|
186
185
|
to_host == url.to_host
|
187
186
|
when :domain # google.com
|
@@ -206,8 +205,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
206
205
|
# @return [Boolean] True if valid, absolute and crawlable, otherwise false.
|
207
206
|
def valid?
|
208
207
|
return false if relative?
|
209
|
-
return false unless
|
210
|
-
return false
|
208
|
+
return false unless to_origin && to_domain
|
209
|
+
return false unless URI::DEFAULT_PARSER.make_regexp.match(normalize)
|
211
210
|
|
212
211
|
true
|
213
212
|
end
|
@@ -238,7 +237,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
238
237
|
Wgit::Url.new(concatted)
|
239
238
|
end
|
240
239
|
|
241
|
-
#
|
240
|
+
# Normalizes/escapes self and returns a new Wgit::Url. Self isn't modified.
|
241
|
+
# This should be used before GET'ing the url, in case it has IRI chars.
|
242
242
|
#
|
243
243
|
# @return [Wgit::Url] An escaped version of self.
|
244
244
|
def normalize
|
@@ -249,8 +249,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
249
249
|
# modify the receiver.
|
250
250
|
#
|
251
251
|
# If self is absolute then it's returned as is, making this method
|
252
|
-
# idempotent. The doc's
|
253
|
-
# doc.url is used as the base; which is concatted with self.
|
252
|
+
# idempotent. The doc's `<base>` element is used if present, otherwise
|
253
|
+
# `doc.url` is used as the base; which is concatted with self.
|
254
254
|
#
|
255
255
|
# Typically used to build an absolute link obtained from a document.
|
256
256
|
#
|
@@ -258,35 +258,37 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
258
258
|
# link = Wgit::Url.new('/favicon.png')
|
259
259
|
# doc = Wgit::Document.new('http://example.com')
|
260
260
|
#
|
261
|
-
# link.
|
261
|
+
# link.make_absolute(doc) # => "http://example.com/favicon.png"
|
262
262
|
#
|
263
263
|
# @param doc [Wgit::Document] The doc whose base Url is concatted with
|
264
264
|
# self.
|
265
265
|
# @raise [StandardError] If doc isn't a Wgit::Document or if `doc.base_url`
|
266
266
|
# raises an Exception.
|
267
267
|
# @return [Wgit::Url] Self in absolute form.
|
268
|
-
def
|
268
|
+
def make_absolute(doc)
|
269
269
|
assert_type(doc, Wgit::Document)
|
270
|
+
raise 'Cannot make absolute when Document @url is not valid' \
|
271
|
+
unless doc.url.valid?
|
272
|
+
|
273
|
+
return prefix_scheme(doc.url.to_scheme&.to_sym) if scheme_relative?
|
270
274
|
|
271
275
|
absolute? ? self : doc.base_url(link: self).concat(self)
|
272
276
|
end
|
273
277
|
|
274
|
-
# Returns self having prefixed a protocol
|
278
|
+
# Returns self having prefixed a scheme/protocol. Doesn't modify receiver.
|
275
279
|
# Returns self even if absolute (with scheme); therefore is idempotent.
|
276
280
|
#
|
277
|
-
# @param
|
278
|
-
# @return [Wgit::Url] Self with a
|
279
|
-
def prefix_scheme(
|
280
|
-
|
281
|
-
|
282
|
-
case protocol
|
283
|
-
when :http
|
284
|
-
Wgit::Url.new("http://#{url}")
|
285
|
-
when :https
|
286
|
-
Wgit::Url.new("https://#{url}")
|
287
|
-
else
|
288
|
-
raise "protocol must be :http or :https, not :#{protocol}"
|
281
|
+
# @param scheme [Symbol] Either :http or :https.
|
282
|
+
# @return [Wgit::Url] Self with a scheme prefix.
|
283
|
+
def prefix_scheme(scheme = :http)
|
284
|
+
unless %i[http https].include?(scheme)
|
285
|
+
raise "scheme must be :http or :https, not :#{scheme}"
|
289
286
|
end
|
287
|
+
|
288
|
+
return self if absolute? && !scheme_relative?
|
289
|
+
|
290
|
+
separator = scheme_relative? ? '' : '//'
|
291
|
+
Wgit::Url.new("#{scheme}:#{separator}#{self}")
|
290
292
|
end
|
291
293
|
|
292
294
|
# Returns a Hash containing this Url's instance vars excluding @uri.
|
@@ -294,8 +296,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
294
296
|
#
|
295
297
|
# @return [Hash] self's instance vars as a Hash.
|
296
298
|
def to_h
|
297
|
-
|
298
|
-
h = Wgit::Utils.to_h(self, ignore: ignore)
|
299
|
+
h = Wgit::Utils.to_h(self, ignore: ['@uri'])
|
299
300
|
Hash[h.to_a.insert(0, ['url', self])] # Insert url at position 0.
|
300
301
|
end
|
301
302
|
|
@@ -338,6 +339,20 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
338
339
|
host ? Wgit::Url.new(host) : nil
|
339
340
|
end
|
340
341
|
|
342
|
+
# Returns a new Wgit::Url containing just the port of this URL e.g.
|
343
|
+
# Given http://www.google.co.uk:443/about.html, '443' is returned.
|
344
|
+
#
|
345
|
+
# @return [Wgit::Url, nil] Containing just the port or nil.
|
346
|
+
def to_port
|
347
|
+
port = @uri.port
|
348
|
+
|
349
|
+
# @uri.port defaults port to 80/443 if missing, so we check for :#{port}.
|
350
|
+
return nil unless port
|
351
|
+
return nil unless include?(":#{port}")
|
352
|
+
|
353
|
+
Wgit::Url.new(port.to_s)
|
354
|
+
end
|
355
|
+
|
341
356
|
# Returns a new Wgit::Url containing just the domain of this URL e.g.
|
342
357
|
# Given http://www.google.co.uk/about.html, google.co.uk is returned.
|
343
358
|
#
|
@@ -347,6 +362,20 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
347
362
|
domain ? Wgit::Url.new(domain) : nil
|
348
363
|
end
|
349
364
|
|
365
|
+
# Returns a new Wgit::Url containing just the sub domain of this URL e.g.
|
366
|
+
# Given http://scripts.dev.google.com, scripts.dev is returned.
|
367
|
+
#
|
368
|
+
# @return [Wgit::Url, nil] Containing just the sub domain or nil.
|
369
|
+
def to_sub_domain
|
370
|
+
return nil unless to_host
|
371
|
+
|
372
|
+
dot_domain = ".#{to_domain}"
|
373
|
+
return nil unless include?(dot_domain)
|
374
|
+
|
375
|
+
sub_domain = to_host.sub(dot_domain, '')
|
376
|
+
Wgit::Url.new(sub_domain)
|
377
|
+
end
|
378
|
+
|
350
379
|
# Returns a new Wgit::Url containing just the brand of this URL e.g.
|
351
380
|
# Given http://www.google.co.uk/about.html, google is returned.
|
352
381
|
#
|
@@ -362,12 +391,24 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
362
391
|
# @return [Wgit::Url, nil] The base of self e.g. http://www.google.co.uk or
|
363
392
|
# nil.
|
364
393
|
def to_base
|
365
|
-
return nil
|
394
|
+
return nil unless @uri.scheme && @uri.host
|
366
395
|
|
367
396
|
base = "#{@uri.scheme}://#{@uri.host}"
|
368
397
|
Wgit::Url.new(base)
|
369
398
|
end
|
370
399
|
|
400
|
+
# Returns only the origin of this URL e.g. the protocol scheme, host and
|
401
|
+
# port combined. For http://localhost:3000/api, http://localhost:3000 gets
|
402
|
+
# returned. If there's no port present, then to_base is returned.
|
403
|
+
#
|
404
|
+
# @return [Wgit::Url, nil] The origin of self or nil.
|
405
|
+
def to_origin
|
406
|
+
return nil unless to_base
|
407
|
+
return to_base unless to_port
|
408
|
+
|
409
|
+
Wgit::Url.new("#{to_base}:#{to_port}")
|
410
|
+
end
|
411
|
+
|
371
412
|
# Returns the path of this URL e.g. the bit after the host without slashes.
|
372
413
|
# For example:
|
373
414
|
# Wgit::Url.new("http://www.google.co.uk/about.html/").to_path returns
|
@@ -396,7 +437,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
396
437
|
end
|
397
438
|
|
398
439
|
# Returns a new Wgit::Url containing just the query string of this URL
|
399
|
-
# e.g. Given http://google.com?q=
|
440
|
+
# e.g. Given http://google.com?q=foo&bar=1, 'q=foo&bar=1' is returned.
|
400
441
|
#
|
401
442
|
# @return [Wgit::Url, nil] Containing just the query string or nil.
|
402
443
|
def to_query
|
@@ -404,6 +445,24 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
404
445
|
query ? Wgit::Url.new(query) : nil
|
405
446
|
end
|
406
447
|
|
448
|
+
# Returns a Hash containing just the query string parameters of this URL
|
449
|
+
# e.g. Given http://google.com?q=ruby, "{ 'q' => 'ruby' }" is returned.
|
450
|
+
#
|
451
|
+
# @param symbolize_keys [Boolean] The returned Hash keys will be Symbols if
|
452
|
+
# true, Strings otherwise.
|
453
|
+
# @return [Hash<String | Symbol, String>] Containing the query string
|
454
|
+
# params or empty if the URL doesn't contain any query parameters.
|
455
|
+
def to_query_hash(symbolize_keys: false)
|
456
|
+
query_str = to_query
|
457
|
+
return {} unless query_str
|
458
|
+
|
459
|
+
query_str.split('&').each_with_object({}) do |param, hash|
|
460
|
+
k, v = param.split('=')
|
461
|
+
k = k.to_sym if symbolize_keys
|
462
|
+
hash[k] = v
|
463
|
+
end
|
464
|
+
end
|
465
|
+
|
407
466
|
# Returns a new Wgit::Url containing just the fragment string of this URL
|
408
467
|
# e.g. Given http://google.com#about, #about is returned.
|
409
468
|
#
|
@@ -425,6 +484,24 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
425
484
|
segs.length > 1 ? Wgit::Url.new(segs.last) : nil
|
426
485
|
end
|
427
486
|
|
487
|
+
# Returns a new Wgit::Url containing just the username string of this URL
|
488
|
+
# e.g. Given http://me:pass1@example.com, me is returned.
|
489
|
+
#
|
490
|
+
# @return [Wgit::Url, nil] Containing just the user string or nil.
|
491
|
+
def to_user
|
492
|
+
user = @uri.user
|
493
|
+
user ? Wgit::Url.new(user) : nil
|
494
|
+
end
|
495
|
+
|
496
|
+
# Returns a new Wgit::Url containing just the password string of this URL
|
497
|
+
# e.g. Given http://me:pass1@example.com, pass1 is returned.
|
498
|
+
#
|
499
|
+
# @return [Wgit::Url, nil] Containing just the password string or nil.
|
500
|
+
def to_password
|
501
|
+
password = @uri.password
|
502
|
+
password ? Wgit::Url.new(password) : nil
|
503
|
+
end
|
504
|
+
|
428
505
|
# Omits the given URL components from self and returns a new Wgit::Url.
|
429
506
|
#
|
430
507
|
# Calls Addressable::URI#omit underneath and creates a new Wgit::Url from
|
@@ -468,7 +545,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
468
545
|
.omit_trailing_slash
|
469
546
|
end
|
470
547
|
|
471
|
-
# Returns a new Wgit::Url with the base (
|
548
|
+
# Returns a new Wgit::Url with the base (scheme and host) removed e.g. Given
|
472
549
|
# http://google.com/search?q=something#about, search?q=something#about is
|
473
550
|
# returned. If relative and base isn't present then self is returned.
|
474
551
|
# Leading and trailing slashes are always stripped from the return value.
|
@@ -483,6 +560,21 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
483
560
|
Wgit::Url.new(omit_base).omit_slashes
|
484
561
|
end
|
485
562
|
|
563
|
+
# Returns a new Wgit::Url with the origin (base + port) removed e.g. Given
|
564
|
+
# http://google.com:81/search?q=something#about, search?q=something#about is
|
565
|
+
# returned. If relative and base isn't present then self is returned.
|
566
|
+
# Leading and trailing slashes are always stripped from the return value.
|
567
|
+
#
|
568
|
+
# @return [Wgit::Url] Self containing everything after the origin.
|
569
|
+
def omit_origin
|
570
|
+
origin = to_origin
|
571
|
+
omit_origin = origin ? gsub(origin, '') : self
|
572
|
+
|
573
|
+
return self if ['', '/'].include?(omit_origin)
|
574
|
+
|
575
|
+
Wgit::Url.new(omit_origin).omit_slashes
|
576
|
+
end
|
577
|
+
|
486
578
|
# Returns a new Wgit::Url with the query string portion removed e.g. Given
|
487
579
|
# http://google.com/search?q=hello, http://google.com/search is
|
488
580
|
# returned. Self is returned as is if no query string is present. A URL
|
@@ -528,25 +620,47 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
528
620
|
start_with?('#')
|
529
621
|
end
|
530
622
|
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
alias
|
547
|
-
alias
|
548
|
-
alias
|
549
|
-
alias
|
550
|
-
alias
|
623
|
+
# Returns true if self equals '/' a.k.a. index.
|
624
|
+
#
|
625
|
+
# @return [Boolean] True if self equals '/', false otherwise.
|
626
|
+
def index?
|
627
|
+
self == '/'
|
628
|
+
end
|
629
|
+
|
630
|
+
# Returns true if self starts with '//' a.k.a. a scheme/protocol relative
|
631
|
+
# path.
|
632
|
+
#
|
633
|
+
# @return [Boolean] True if self starts with '//', false otherwise.
|
634
|
+
def scheme_relative?
|
635
|
+
start_with?('//')
|
636
|
+
end
|
637
|
+
|
638
|
+
alias + concat
|
639
|
+
alias crawled? crawled
|
640
|
+
alias is_relative? relative?
|
641
|
+
alias is_absolute? absolute?
|
642
|
+
alias is_valid? valid?
|
643
|
+
alias is_query? query?
|
644
|
+
alias is_fragment? fragment?
|
645
|
+
alias is_index? index?
|
646
|
+
alias is_scheme_relative? scheme_relative?
|
647
|
+
alias uri to_uri
|
648
|
+
alias url to_url
|
649
|
+
alias scheme to_scheme
|
650
|
+
alias host to_host
|
651
|
+
alias port to_port
|
652
|
+
alias domain to_domain
|
653
|
+
alias brand to_brand
|
654
|
+
alias base to_base
|
655
|
+
alias origin to_origin
|
656
|
+
alias path to_path
|
657
|
+
alias endpoint to_endpoint
|
658
|
+
alias query to_query
|
659
|
+
alias query_hash to_query_hash
|
660
|
+
alias fragment to_fragment
|
661
|
+
alias extension to_extension
|
662
|
+
alias user to_user
|
663
|
+
alias password to_password
|
664
|
+
alias sub_domain to_sub_domain
|
551
665
|
end
|
552
666
|
end
|
data/lib/wgit/utils.rb
CHANGED
@@ -145,7 +145,8 @@ module Wgit
|
|
145
145
|
# @param keyword_limit [Integer] The max amount of keywords to be
|
146
146
|
# outputted to the stream.
|
147
147
|
# @param stream [#puts] Any object that respond_to?(:puts). It is used
|
148
|
-
# to output text somewhere e.g. a file or
|
148
|
+
# to output text somewhere e.g. a file or STDERR.
|
149
|
+
# @return [Integer] The number of results.
|
149
150
|
def self.printf_search_results(results, keyword_limit: 5, stream: STDOUT)
|
150
151
|
raise 'stream must respond_to? :puts' unless stream.respond_to?(:puts)
|
151
152
|
|
@@ -162,18 +163,37 @@ module Wgit
|
|
162
163
|
stream.puts
|
163
164
|
end
|
164
165
|
|
165
|
-
|
166
|
+
results.size
|
166
167
|
end
|
167
168
|
|
168
|
-
#
|
169
|
+
# Sanitises the obj to make it uniform by calling the correct sanitize_*
|
170
|
+
# method for its type e.g. if obj.is_a? String then sanitize_str(obj). Any type
|
171
|
+
# not in the case statement will be ignored and returned as is.
|
172
|
+
#
|
173
|
+
# @param obj [Object] The object to be sanitized.
|
174
|
+
# @param encode [Boolean] Whether or not to encode to UTF-8 replacing
|
175
|
+
# invalid characters.
|
176
|
+
# @return [Object] The sanitized obj is both modified and then returned.
|
177
|
+
def self.sanitize(obj, encode: true)
|
178
|
+
case obj
|
179
|
+
when String
|
180
|
+
sanitize_str(obj, encode: encode)
|
181
|
+
when Array
|
182
|
+
sanitize_arr(obj, encode: encode)
|
183
|
+
else
|
184
|
+
obj
|
185
|
+
end
|
186
|
+
end
|
187
|
+
|
188
|
+
# Sanitises a String to make it uniform. Strips any leading/trailing white
|
169
189
|
# space. Also applies UTF-8 encoding (replacing invalid characters) if
|
170
190
|
# `encode: true`.
|
171
191
|
#
|
172
|
-
# @param str [String] The String to
|
192
|
+
# @param str [String] The String to sanitize. str is modified.
|
173
193
|
# @param encode [Boolean] Whether or not to encode to UTF-8 replacing
|
174
194
|
# invalid characters.
|
175
|
-
# @return [String] The
|
176
|
-
def self.
|
195
|
+
# @return [String] The sanitized str is both modified and then returned.
|
196
|
+
def self.sanitize_str(str, encode: true)
|
177
197
|
if str.is_a?(String)
|
178
198
|
str.encode!('UTF-8', undef: :replace, invalid: :replace) if encode
|
179
199
|
str.strip!
|
@@ -182,15 +202,15 @@ module Wgit
|
|
182
202
|
str
|
183
203
|
end
|
184
204
|
|
185
|
-
#
|
186
|
-
# processes non empty Strings using Wgit::Utils.
|
205
|
+
# Sanitises an Array to make it uniform. Removes empty Strings and nils,
|
206
|
+
# processes non empty Strings using Wgit::Utils.sanitize and removes
|
187
207
|
# duplicates.
|
188
208
|
#
|
189
|
-
# @param arr [Enumerable] The Array to
|
190
|
-
# @return [Enumerable] The
|
191
|
-
def self.
|
209
|
+
# @param arr [Enumerable] The Array to sanitize. arr is modified.
|
210
|
+
# @return [Enumerable] The sanitized arr is both modified and then returned.
|
211
|
+
def self.sanitize_arr(arr, encode: true)
|
192
212
|
if arr.is_a?(Array)
|
193
|
-
arr.map! { |str|
|
213
|
+
arr.map! { |str| sanitize(str, encode: encode) }
|
194
214
|
arr.reject! { |str| str.is_a?(String) ? str.empty? : false }
|
195
215
|
arr.compact!
|
196
216
|
arr.uniq!
|
@@ -198,13 +218,5 @@ module Wgit
|
|
198
218
|
|
199
219
|
arr
|
200
220
|
end
|
201
|
-
|
202
|
-
# Returns the model having removed non bson types (for use with MongoDB).
|
203
|
-
#
|
204
|
-
# @param model_hash [Hash] The model Hash to process.
|
205
|
-
# @return [Hash] The model Hash with non bson types removed.
|
206
|
-
def self.remove_non_bson_types(model_hash)
|
207
|
-
model_hash.select { |_k, v| v.respond_to?(:bson_type) }
|
208
|
-
end
|
209
221
|
end
|
210
222
|
end
|
data/lib/wgit/version.rb
CHANGED
@@ -2,10 +2,11 @@
|
|
2
2
|
|
3
3
|
# Wgit is a WWW indexer/scraper which crawls URL's and retrieves their page
|
4
4
|
# contents for later use.
|
5
|
+
#
|
5
6
|
# @author Michael Telford
|
6
7
|
module Wgit
|
7
8
|
# The current gem version of Wgit.
|
8
|
-
VERSION = '0.
|
9
|
+
VERSION = '0.10.2'
|
9
10
|
|
10
11
|
# Returns the current gem version of Wgit as a String.
|
11
12
|
def self.version
|
data/lib/wgit.rb
CHANGED
@@ -6,9 +6,11 @@ require_relative 'wgit/assertable'
|
|
6
6
|
require_relative 'wgit/utils'
|
7
7
|
require_relative 'wgit/url'
|
8
8
|
require_relative 'wgit/document'
|
9
|
-
require_relative 'wgit/
|
9
|
+
require_relative 'wgit/document_extractors'
|
10
10
|
require_relative 'wgit/crawler'
|
11
11
|
require_relative 'wgit/database/model'
|
12
12
|
require_relative 'wgit/database/database'
|
13
13
|
require_relative 'wgit/indexer'
|
14
|
+
require_relative 'wgit/dsl'
|
15
|
+
require_relative 'wgit/base'
|
14
16
|
# require_relative 'wgit/core_ext' - Must be explicitly required.
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wgit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.10.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-11-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '1.3'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: ferrum
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0.8'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0.8'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: byebug
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -184,14 +198,10 @@ dependencies:
|
|
184
198
|
- - "<"
|
185
199
|
- !ruby/object:Gem::Version
|
186
200
|
version: '1.0'
|
187
|
-
description: '
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
WWW search engine. The Wgit API is easily extended allowing you to pull out the
|
192
|
-
parts of a webpage that are important to you, the code snippets or tables for example.
|
193
|
-
As Wgit is a library, it supports many different use cases including data mining,
|
194
|
-
analytics, web indexing and URL parsing to name a few.
|
201
|
+
description: 'Wgit was primarily designed to crawl static HTML websites to index and
|
202
|
+
search their content - providing the basis of any search engine; but Wgit is suitable
|
203
|
+
for many application domains including: URL parsing, data mining and statistical
|
204
|
+
analysis.
|
195
205
|
|
196
206
|
'
|
197
207
|
email: michael.telford@live.com
|
@@ -202,12 +212,14 @@ extra_rdoc_files: []
|
|
202
212
|
files:
|
203
213
|
- "./lib/wgit.rb"
|
204
214
|
- "./lib/wgit/assertable.rb"
|
215
|
+
- "./lib/wgit/base.rb"
|
205
216
|
- "./lib/wgit/core_ext.rb"
|
206
217
|
- "./lib/wgit/crawler.rb"
|
207
218
|
- "./lib/wgit/database/database.rb"
|
208
219
|
- "./lib/wgit/database/model.rb"
|
209
220
|
- "./lib/wgit/document.rb"
|
210
|
-
- "./lib/wgit/
|
221
|
+
- "./lib/wgit/document_extractors.rb"
|
222
|
+
- "./lib/wgit/dsl.rb"
|
211
223
|
- "./lib/wgit/indexer.rb"
|
212
224
|
- "./lib/wgit/logger.rb"
|
213
225
|
- "./lib/wgit/response.rb"
|
@@ -229,7 +241,7 @@ metadata:
|
|
229
241
|
source_code_uri: https://github.com/michaeltelford/wgit
|
230
242
|
changelog_uri: https://github.com/michaeltelford/wgit/blob/master/CHANGELOG.md
|
231
243
|
bug_tracker_uri: https://github.com/michaeltelford/wgit/issues
|
232
|
-
documentation_uri: https://www.rubydoc.info/
|
244
|
+
documentation_uri: https://www.rubydoc.info/gems/wgit
|
233
245
|
allowed_push_host: https://rubygems.org
|
234
246
|
post_install_message: Added the 'wgit' executable to $PATH
|
235
247
|
rdoc_options: []
|
@@ -237,18 +249,21 @@ require_paths:
|
|
237
249
|
- lib
|
238
250
|
required_ruby_version: !ruby/object:Gem::Requirement
|
239
251
|
requirements:
|
240
|
-
- - "
|
252
|
+
- - ">="
|
253
|
+
- !ruby/object:Gem::Version
|
254
|
+
version: '2.6'
|
255
|
+
- - "<"
|
241
256
|
- !ruby/object:Gem::Version
|
242
|
-
version: '
|
257
|
+
version: '4'
|
243
258
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
244
259
|
requirements:
|
245
260
|
- - ">="
|
246
261
|
- !ruby/object:Gem::Version
|
247
262
|
version: '0'
|
248
263
|
requirements: []
|
249
|
-
rubygems_version: 3.
|
250
|
-
signing_key:
|
264
|
+
rubygems_version: 3.2.22
|
265
|
+
signing_key:
|
251
266
|
specification_version: 4
|
252
|
-
summary: Wgit is a
|
253
|
-
|
267
|
+
summary: Wgit is a HTML web crawler, written in Ruby, that allows you to programmatically
|
268
|
+
extract the data you want from the web.
|
254
269
|
test_files: []
|