wgit 0.7.0 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +1 -1
- data/CHANGELOG.md +74 -2
- data/LICENSE.txt +1 -1
- data/README.md +114 -290
- data/bin/wgit +9 -5
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +30 -0
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +219 -79
- data/lib/wgit/database/database.rb +309 -134
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +226 -143
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +21 -11
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +65 -162
- data/lib/wgit/response.rb +11 -8
- data/lib/wgit/url.rb +192 -61
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +2 -1
- data/lib/wgit.rb +3 -1
- metadata +34 -19
data/lib/wgit/url.rb
CHANGED
@@ -6,15 +6,15 @@ require 'uri'
|
|
6
6
|
require 'addressable/uri'
|
7
7
|
|
8
8
|
module Wgit
|
9
|
-
# Class modeling a web based HTTP URL.
|
9
|
+
# Class modeling/serialising a web based HTTP URL.
|
10
10
|
#
|
11
11
|
# Can be an internal/relative link e.g. "about.html" or an absolute URL
|
12
|
-
# e.g. "http://www.google.co.uk". Is a subclass of String and uses
|
13
|
-
#
|
12
|
+
# e.g. "http://www.google.co.uk". Is a subclass of String and uses `URI` and
|
13
|
+
# `addressable/uri` internally for parsing.
|
14
14
|
#
|
15
|
-
# Most of the methods in this class return new Wgit::Url instances making
|
16
|
-
# method calls chainable e.g. url.omit_base.omit_fragment etc. The
|
17
|
-
# also try to be idempotent where possible.
|
15
|
+
# Most of the methods in this class return new `Wgit::Url` instances making
|
16
|
+
# the method calls chainable e.g. `url.omit_base.omit_fragment` etc. The
|
17
|
+
# methods also try to be idempotent where possible.
|
18
18
|
class Url < String
|
19
19
|
include Assertable
|
20
20
|
|
@@ -28,7 +28,7 @@ module Wgit
|
|
28
28
|
# The duration of the crawl for this Url (in seconds).
|
29
29
|
attr_accessor :crawl_duration
|
30
30
|
|
31
|
-
# Initializes a new instance of Wgit::Url which
|
31
|
+
# Initializes a new instance of Wgit::Url which models a web based
|
32
32
|
# HTTP URL.
|
33
33
|
#
|
34
34
|
# @param url_or_obj [String, Wgit::Url, #fetch#[]] Is either a String
|
@@ -90,6 +90,23 @@ module Wgit
|
|
90
90
|
obj.is_a?(Wgit::Url) ? obj : new(obj)
|
91
91
|
end
|
92
92
|
|
93
|
+
# Returns a Wgit::Url instance from Wgit::Url.parse, or nil if obj cannot
|
94
|
+
# be parsed successfully e.g. the String is invalid.
|
95
|
+
#
|
96
|
+
# Use this method when you can't guarantee that obj is parsable as a URL.
|
97
|
+
# See Wgit::Url.parse for more information.
|
98
|
+
#
|
99
|
+
# @param obj [Object] The object to parse, which #is_a?(String).
|
100
|
+
# @raise [StandardError] If obj.is_a?(String) is false.
|
101
|
+
# @return [Wgit::Url, nil] A Wgit::Url instance or nil (if obj is invalid).
|
102
|
+
def self.parse?(obj)
|
103
|
+
parse(obj)
|
104
|
+
rescue Addressable::URI::InvalidURIError
|
105
|
+
Wgit.logger.debug("Wgit::Url.parse?('#{obj}') exception: \
|
106
|
+
Addressable::URI::InvalidURIError")
|
107
|
+
nil
|
108
|
+
end
|
109
|
+
|
93
110
|
# Sets the @crawled instance var, also setting @date_crawled for
|
94
111
|
# convenience.
|
95
112
|
#
|
@@ -98,8 +115,6 @@ module Wgit
|
|
98
115
|
def crawled=(bool)
|
99
116
|
@crawled = bool
|
100
117
|
@date_crawled = bool ? Wgit::Utils.time_stamp : nil
|
101
|
-
|
102
|
-
bool
|
103
118
|
end
|
104
119
|
|
105
120
|
# Overrides String#replace setting the new_url @uri and String value.
|
@@ -129,10 +144,10 @@ module Wgit
|
|
129
144
|
# @param opts [Hash] The options with which to check relativity. Only one
|
130
145
|
# opts param should be provided. The provided opts param Url must be
|
131
146
|
# absolute and be prefixed with a scheme. Consider using the output of
|
132
|
-
# Wgit::Url#
|
133
|
-
# @option opts [Wgit::Url, String] :
|
134
|
-
# http://www.google.com/how which gives a
|
135
|
-
# 'http://www.google.com'.
|
147
|
+
# Wgit::Url#to_origin which should work (unless it's nil).
|
148
|
+
# @option opts [Wgit::Url, String] :origin The Url origin e.g.
|
149
|
+
# http://www.google.com:81/how which gives an origin of
|
150
|
+
# 'http://www.google.com:81'.
|
136
151
|
# @option opts [Wgit::Url, String] :host The Url host e.g.
|
137
152
|
# http://www.google.com/how which gives a host of 'www.google.com'.
|
138
153
|
# @option opts [Wgit::Url, String] :domain The Url domain e.g.
|
@@ -143,10 +158,11 @@ module Wgit
|
|
143
158
|
# param has been provided.
|
144
159
|
# @return [Boolean] True if relative, false if absolute.
|
145
160
|
def relative?(opts = {})
|
146
|
-
defaults = {
|
161
|
+
defaults = { origin: nil, host: nil, domain: nil, brand: nil }
|
147
162
|
opts = defaults.merge(opts)
|
148
163
|
raise 'Url (self) cannot be empty' if empty?
|
149
164
|
|
165
|
+
return false if scheme_relative?
|
150
166
|
return true if @uri.relative?
|
151
167
|
|
152
168
|
# Self is absolute but may be relative to the opts param e.g. host.
|
@@ -163,8 +179,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
163
179
|
end
|
164
180
|
|
165
181
|
case type
|
166
|
-
when :
|
167
|
-
|
182
|
+
when :origin # http://www.google.com:81
|
183
|
+
to_origin == url.to_origin
|
168
184
|
when :host # www.google.com
|
169
185
|
to_host == url.to_host
|
170
186
|
when :domain # google.com
|
@@ -189,8 +205,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
189
205
|
# @return [Boolean] True if valid, absolute and crawlable, otherwise false.
|
190
206
|
def valid?
|
191
207
|
return false if relative?
|
192
|
-
return false unless
|
193
|
-
return false
|
208
|
+
return false unless to_origin && to_domain
|
209
|
+
return false unless URI::DEFAULT_PARSER.make_regexp.match(normalize)
|
194
210
|
|
195
211
|
true
|
196
212
|
end
|
@@ -221,7 +237,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
221
237
|
Wgit::Url.new(concatted)
|
222
238
|
end
|
223
239
|
|
224
|
-
#
|
240
|
+
# Normalizes/escapes self and returns a new Wgit::Url. Self isn't modified.
|
241
|
+
# This should be used before GET'ing the url, in case it has IRI chars.
|
225
242
|
#
|
226
243
|
# @return [Wgit::Url] An escaped version of self.
|
227
244
|
def normalize
|
@@ -232,8 +249,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
232
249
|
# modify the receiver.
|
233
250
|
#
|
234
251
|
# If self is absolute then it's returned as is, making this method
|
235
|
-
# idempotent. The doc's
|
236
|
-
# doc.url is used as the base; which is concatted with self.
|
252
|
+
# idempotent. The doc's `<base>` element is used if present, otherwise
|
253
|
+
# `doc.url` is used as the base; which is concatted with self.
|
237
254
|
#
|
238
255
|
# Typically used to build an absolute link obtained from a document.
|
239
256
|
#
|
@@ -241,35 +258,37 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
241
258
|
# link = Wgit::Url.new('/favicon.png')
|
242
259
|
# doc = Wgit::Document.new('http://example.com')
|
243
260
|
#
|
244
|
-
# link.
|
261
|
+
# link.make_absolute(doc) # => "http://example.com/favicon.png"
|
245
262
|
#
|
246
263
|
# @param doc [Wgit::Document] The doc whose base Url is concatted with
|
247
264
|
# self.
|
248
265
|
# @raise [StandardError] If doc isn't a Wgit::Document or if `doc.base_url`
|
249
266
|
# raises an Exception.
|
250
267
|
# @return [Wgit::Url] Self in absolute form.
|
251
|
-
def
|
268
|
+
def make_absolute(doc)
|
252
269
|
assert_type(doc, Wgit::Document)
|
270
|
+
raise 'Cannot make absolute when Document @url is not valid' \
|
271
|
+
unless doc.url.valid?
|
272
|
+
|
273
|
+
return prefix_scheme(doc.url.to_scheme&.to_sym) if scheme_relative?
|
253
274
|
|
254
275
|
absolute? ? self : doc.base_url(link: self).concat(self)
|
255
276
|
end
|
256
277
|
|
257
|
-
# Returns self having prefixed a protocol
|
278
|
+
# Returns self having prefixed a scheme/protocol. Doesn't modify receiver.
|
258
279
|
# Returns self even if absolute (with scheme); therefore is idempotent.
|
259
280
|
#
|
260
|
-
# @param
|
261
|
-
# @return [Wgit::Url] Self with a
|
262
|
-
def prefix_scheme(
|
263
|
-
|
264
|
-
|
265
|
-
case protocol
|
266
|
-
when :http
|
267
|
-
Wgit::Url.new("http://#{url}")
|
268
|
-
when :https
|
269
|
-
Wgit::Url.new("https://#{url}")
|
270
|
-
else
|
271
|
-
raise "protocol must be :http or :https, not :#{protocol}"
|
281
|
+
# @param scheme [Symbol] Either :http or :https.
|
282
|
+
# @return [Wgit::Url] Self with a scheme prefix.
|
283
|
+
def prefix_scheme(scheme = :http)
|
284
|
+
unless %i[http https].include?(scheme)
|
285
|
+
raise "scheme must be :http or :https, not :#{scheme}"
|
272
286
|
end
|
287
|
+
|
288
|
+
return self if absolute? && !scheme_relative?
|
289
|
+
|
290
|
+
separator = scheme_relative? ? '' : '//'
|
291
|
+
Wgit::Url.new("#{scheme}:#{separator}#{self}")
|
273
292
|
end
|
274
293
|
|
275
294
|
# Returns a Hash containing this Url's instance vars excluding @uri.
|
@@ -277,8 +296,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
277
296
|
#
|
278
297
|
# @return [Hash] self's instance vars as a Hash.
|
279
298
|
def to_h
|
280
|
-
|
281
|
-
h = Wgit::Utils.to_h(self, ignore: ignore)
|
299
|
+
h = Wgit::Utils.to_h(self, ignore: ['@uri'])
|
282
300
|
Hash[h.to_a.insert(0, ['url', self])] # Insert url at position 0.
|
283
301
|
end
|
284
302
|
|
@@ -321,6 +339,20 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
321
339
|
host ? Wgit::Url.new(host) : nil
|
322
340
|
end
|
323
341
|
|
342
|
+
# Returns a new Wgit::Url containing just the port of this URL e.g.
|
343
|
+
# Given http://www.google.co.uk:443/about.html, '443' is returned.
|
344
|
+
#
|
345
|
+
# @return [Wgit::Url, nil] Containing just the port or nil.
|
346
|
+
def to_port
|
347
|
+
port = @uri.port
|
348
|
+
|
349
|
+
# @uri.port defaults port to 80/443 if missing, so we check for :#{port}.
|
350
|
+
return nil unless port
|
351
|
+
return nil unless include?(":#{port}")
|
352
|
+
|
353
|
+
Wgit::Url.new(port.to_s)
|
354
|
+
end
|
355
|
+
|
324
356
|
# Returns a new Wgit::Url containing just the domain of this URL e.g.
|
325
357
|
# Given http://www.google.co.uk/about.html, google.co.uk is returned.
|
326
358
|
#
|
@@ -330,6 +362,20 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
330
362
|
domain ? Wgit::Url.new(domain) : nil
|
331
363
|
end
|
332
364
|
|
365
|
+
# Returns a new Wgit::Url containing just the sub domain of this URL e.g.
|
366
|
+
# Given http://scripts.dev.google.com, scripts.dev is returned.
|
367
|
+
#
|
368
|
+
# @return [Wgit::Url, nil] Containing just the sub domain or nil.
|
369
|
+
def to_sub_domain
|
370
|
+
return nil unless to_host
|
371
|
+
|
372
|
+
dot_domain = ".#{to_domain}"
|
373
|
+
return nil unless include?(dot_domain)
|
374
|
+
|
375
|
+
sub_domain = to_host.sub(dot_domain, '')
|
376
|
+
Wgit::Url.new(sub_domain)
|
377
|
+
end
|
378
|
+
|
333
379
|
# Returns a new Wgit::Url containing just the brand of this URL e.g.
|
334
380
|
# Given http://www.google.co.uk/about.html, google is returned.
|
335
381
|
#
|
@@ -345,12 +391,24 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
345
391
|
# @return [Wgit::Url, nil] The base of self e.g. http://www.google.co.uk or
|
346
392
|
# nil.
|
347
393
|
def to_base
|
348
|
-
return nil
|
394
|
+
return nil unless @uri.scheme && @uri.host
|
349
395
|
|
350
396
|
base = "#{@uri.scheme}://#{@uri.host}"
|
351
397
|
Wgit::Url.new(base)
|
352
398
|
end
|
353
399
|
|
400
|
+
# Returns only the origin of this URL e.g. the protocol scheme, host and
|
401
|
+
# port combined. For http://localhost:3000/api, http://localhost:3000 gets
|
402
|
+
# returned. If there's no port present, then to_base is returned.
|
403
|
+
#
|
404
|
+
# @return [Wgit::Url, nil] The origin of self or nil.
|
405
|
+
def to_origin
|
406
|
+
return nil unless to_base
|
407
|
+
return to_base unless to_port
|
408
|
+
|
409
|
+
Wgit::Url.new("#{to_base}:#{to_port}")
|
410
|
+
end
|
411
|
+
|
354
412
|
# Returns the path of this URL e.g. the bit after the host without slashes.
|
355
413
|
# For example:
|
356
414
|
# Wgit::Url.new("http://www.google.co.uk/about.html/").to_path returns
|
@@ -379,7 +437,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
379
437
|
end
|
380
438
|
|
381
439
|
# Returns a new Wgit::Url containing just the query string of this URL
|
382
|
-
# e.g. Given http://google.com?q=
|
440
|
+
# e.g. Given http://google.com?q=foo&bar=1, 'q=foo&bar=1' is returned.
|
383
441
|
#
|
384
442
|
# @return [Wgit::Url, nil] Containing just the query string or nil.
|
385
443
|
def to_query
|
@@ -387,6 +445,24 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
387
445
|
query ? Wgit::Url.new(query) : nil
|
388
446
|
end
|
389
447
|
|
448
|
+
# Returns a Hash containing just the query string parameters of this URL
|
449
|
+
# e.g. Given http://google.com?q=ruby, "{ 'q' => 'ruby' }" is returned.
|
450
|
+
#
|
451
|
+
# @param symbolize_keys [Boolean] The returned Hash keys will be Symbols if
|
452
|
+
# true, Strings otherwise.
|
453
|
+
# @return [Hash<String | Symbol, String>] Containing the query string
|
454
|
+
# params or empty if the URL doesn't contain any query parameters.
|
455
|
+
def to_query_hash(symbolize_keys: false)
|
456
|
+
query_str = to_query
|
457
|
+
return {} unless query_str
|
458
|
+
|
459
|
+
query_str.split('&').each_with_object({}) do |param, hash|
|
460
|
+
k, v = param.split('=')
|
461
|
+
k = k.to_sym if symbolize_keys
|
462
|
+
hash[k] = v
|
463
|
+
end
|
464
|
+
end
|
465
|
+
|
390
466
|
# Returns a new Wgit::Url containing just the fragment string of this URL
|
391
467
|
# e.g. Given http://google.com#about, #about is returned.
|
392
468
|
#
|
@@ -408,6 +484,24 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
408
484
|
segs.length > 1 ? Wgit::Url.new(segs.last) : nil
|
409
485
|
end
|
410
486
|
|
487
|
+
# Returns a new Wgit::Url containing just the username string of this URL
|
488
|
+
# e.g. Given http://me:pass1@example.com, me is returned.
|
489
|
+
#
|
490
|
+
# @return [Wgit::Url, nil] Containing just the user string or nil.
|
491
|
+
def to_user
|
492
|
+
user = @uri.user
|
493
|
+
user ? Wgit::Url.new(user) : nil
|
494
|
+
end
|
495
|
+
|
496
|
+
# Returns a new Wgit::Url containing just the password string of this URL
|
497
|
+
# e.g. Given http://me:pass1@example.com, pass1 is returned.
|
498
|
+
#
|
499
|
+
# @return [Wgit::Url, nil] Containing just the password string or nil.
|
500
|
+
def to_password
|
501
|
+
password = @uri.password
|
502
|
+
password ? Wgit::Url.new(password) : nil
|
503
|
+
end
|
504
|
+
|
411
505
|
# Omits the given URL components from self and returns a new Wgit::Url.
|
412
506
|
#
|
413
507
|
# Calls Addressable::URI#omit underneath and creates a new Wgit::Url from
|
@@ -451,7 +545,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
451
545
|
.omit_trailing_slash
|
452
546
|
end
|
453
547
|
|
454
|
-
# Returns a new Wgit::Url with the base (
|
548
|
+
# Returns a new Wgit::Url with the base (scheme and host) removed e.g. Given
|
455
549
|
# http://google.com/search?q=something#about, search?q=something#about is
|
456
550
|
# returned. If relative and base isn't present then self is returned.
|
457
551
|
# Leading and trailing slashes are always stripped from the return value.
|
@@ -466,6 +560,21 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
466
560
|
Wgit::Url.new(omit_base).omit_slashes
|
467
561
|
end
|
468
562
|
|
563
|
+
# Returns a new Wgit::Url with the origin (base + port) removed e.g. Given
|
564
|
+
# http://google.com:81/search?q=something#about, search?q=something#about is
|
565
|
+
# returned. If relative and base isn't present then self is returned.
|
566
|
+
# Leading and trailing slashes are always stripped from the return value.
|
567
|
+
#
|
568
|
+
# @return [Wgit::Url] Self containing everything after the origin.
|
569
|
+
def omit_origin
|
570
|
+
origin = to_origin
|
571
|
+
omit_origin = origin ? gsub(origin, '') : self
|
572
|
+
|
573
|
+
return self if ['', '/'].include?(omit_origin)
|
574
|
+
|
575
|
+
Wgit::Url.new(omit_origin).omit_slashes
|
576
|
+
end
|
577
|
+
|
469
578
|
# Returns a new Wgit::Url with the query string portion removed e.g. Given
|
470
579
|
# http://google.com/search?q=hello, http://google.com/search is
|
471
580
|
# returned. Self is returned as is if no query string is present. A URL
|
@@ -511,25 +620,47 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
511
620
|
start_with?('#')
|
512
621
|
end
|
513
622
|
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
alias
|
530
|
-
alias
|
531
|
-
alias
|
532
|
-
alias
|
533
|
-
alias
|
623
|
+
# Returns true if self equals '/' a.k.a. index.
|
624
|
+
#
|
625
|
+
# @return [Boolean] True if self equals '/', false otherwise.
|
626
|
+
def index?
|
627
|
+
self == '/'
|
628
|
+
end
|
629
|
+
|
630
|
+
# Returns true if self starts with '//' a.k.a. a scheme/protocol relative
|
631
|
+
# path.
|
632
|
+
#
|
633
|
+
# @return [Boolean] True if self starts with '//', false otherwise.
|
634
|
+
def scheme_relative?
|
635
|
+
start_with?('//')
|
636
|
+
end
|
637
|
+
|
638
|
+
alias + concat
|
639
|
+
alias crawled? crawled
|
640
|
+
alias is_relative? relative?
|
641
|
+
alias is_absolute? absolute?
|
642
|
+
alias is_valid? valid?
|
643
|
+
alias is_query? query?
|
644
|
+
alias is_fragment? fragment?
|
645
|
+
alias is_index? index?
|
646
|
+
alias is_scheme_relative? scheme_relative?
|
647
|
+
alias uri to_uri
|
648
|
+
alias url to_url
|
649
|
+
alias scheme to_scheme
|
650
|
+
alias host to_host
|
651
|
+
alias port to_port
|
652
|
+
alias domain to_domain
|
653
|
+
alias brand to_brand
|
654
|
+
alias base to_base
|
655
|
+
alias origin to_origin
|
656
|
+
alias path to_path
|
657
|
+
alias endpoint to_endpoint
|
658
|
+
alias query to_query
|
659
|
+
alias query_hash to_query_hash
|
660
|
+
alias fragment to_fragment
|
661
|
+
alias extension to_extension
|
662
|
+
alias user to_user
|
663
|
+
alias password to_password
|
664
|
+
alias sub_domain to_sub_domain
|
534
665
|
end
|
535
666
|
end
|
data/lib/wgit/utils.rb
CHANGED
@@ -145,7 +145,8 @@ module Wgit
|
|
145
145
|
# @param keyword_limit [Integer] The max amount of keywords to be
|
146
146
|
# outputted to the stream.
|
147
147
|
# @param stream [#puts] Any object that respond_to?(:puts). It is used
|
148
|
-
# to output text somewhere e.g. a file or
|
148
|
+
# to output text somewhere e.g. a file or STDERR.
|
149
|
+
# @return [Integer] The number of results.
|
149
150
|
def self.printf_search_results(results, keyword_limit: 5, stream: STDOUT)
|
150
151
|
raise 'stream must respond_to? :puts' unless stream.respond_to?(:puts)
|
151
152
|
|
@@ -162,18 +163,37 @@ module Wgit
|
|
162
163
|
stream.puts
|
163
164
|
end
|
164
165
|
|
165
|
-
|
166
|
+
results.size
|
166
167
|
end
|
167
168
|
|
168
|
-
#
|
169
|
+
# Sanitises the obj to make it uniform by calling the correct sanitize_*
|
170
|
+
# method for its type e.g. if obj.is_a? String then sanitize_str(obj). Any type
|
171
|
+
# not in the case statement will be ignored and returned as is.
|
172
|
+
#
|
173
|
+
# @param obj [Object] The object to be sanitized.
|
174
|
+
# @param encode [Boolean] Whether or not to encode to UTF-8 replacing
|
175
|
+
# invalid characters.
|
176
|
+
# @return [Object] The sanitized obj is both modified and then returned.
|
177
|
+
def self.sanitize(obj, encode: true)
|
178
|
+
case obj
|
179
|
+
when String
|
180
|
+
sanitize_str(obj, encode: encode)
|
181
|
+
when Array
|
182
|
+
sanitize_arr(obj, encode: encode)
|
183
|
+
else
|
184
|
+
obj
|
185
|
+
end
|
186
|
+
end
|
187
|
+
|
188
|
+
# Sanitises a String to make it uniform. Strips any leading/trailing white
|
169
189
|
# space. Also applies UTF-8 encoding (replacing invalid characters) if
|
170
190
|
# `encode: true`.
|
171
191
|
#
|
172
|
-
# @param str [String] The String to
|
192
|
+
# @param str [String] The String to sanitize. str is modified.
|
173
193
|
# @param encode [Boolean] Whether or not to encode to UTF-8 replacing
|
174
194
|
# invalid characters.
|
175
|
-
# @return [String] The
|
176
|
-
def self.
|
195
|
+
# @return [String] The sanitized str is both modified and then returned.
|
196
|
+
def self.sanitize_str(str, encode: true)
|
177
197
|
if str.is_a?(String)
|
178
198
|
str.encode!('UTF-8', undef: :replace, invalid: :replace) if encode
|
179
199
|
str.strip!
|
@@ -182,15 +202,15 @@ module Wgit
|
|
182
202
|
str
|
183
203
|
end
|
184
204
|
|
185
|
-
#
|
186
|
-
# processes non empty Strings using Wgit::Utils.
|
205
|
+
# Sanitises an Array to make it uniform. Removes empty Strings and nils,
|
206
|
+
# processes non empty Strings using Wgit::Utils.sanitize and removes
|
187
207
|
# duplicates.
|
188
208
|
#
|
189
|
-
# @param arr [Enumerable] The Array to
|
190
|
-
# @return [Enumerable] The
|
191
|
-
def self.
|
209
|
+
# @param arr [Enumerable] The Array to sanitize. arr is modified.
|
210
|
+
# @return [Enumerable] The sanitized arr is both modified and then returned.
|
211
|
+
def self.sanitize_arr(arr, encode: true)
|
192
212
|
if arr.is_a?(Array)
|
193
|
-
arr.map! { |str|
|
213
|
+
arr.map! { |str| sanitize(str, encode: encode) }
|
194
214
|
arr.reject! { |str| str.is_a?(String) ? str.empty? : false }
|
195
215
|
arr.compact!
|
196
216
|
arr.uniq!
|
@@ -198,13 +218,5 @@ module Wgit
|
|
198
218
|
|
199
219
|
arr
|
200
220
|
end
|
201
|
-
|
202
|
-
# Returns the model having removed non bson types (for use with MongoDB).
|
203
|
-
#
|
204
|
-
# @param model_hash [Hash] The model Hash to process.
|
205
|
-
# @return [Hash] The model Hash with non bson types removed.
|
206
|
-
def self.remove_non_bson_types(model_hash)
|
207
|
-
model_hash.select { |_k, v| v.respond_to?(:bson_type) }
|
208
|
-
end
|
209
221
|
end
|
210
222
|
end
|
data/lib/wgit/version.rb
CHANGED
@@ -2,10 +2,11 @@
|
|
2
2
|
|
3
3
|
# Wgit is a WWW indexer/scraper which crawls URLs and retrieves their page
|
4
4
|
# contents for later use.
|
5
|
+
#
|
5
6
|
# @author Michael Telford
|
6
7
|
module Wgit
|
7
8
|
# The current gem version of Wgit.
|
8
|
-
VERSION = '0.
|
9
|
+
VERSION = '0.10.1'
|
9
10
|
|
10
11
|
# Returns the current gem version of Wgit as a String.
|
11
12
|
def self.version
|
data/lib/wgit.rb
CHANGED
@@ -6,9 +6,11 @@ require_relative 'wgit/assertable'
|
|
6
6
|
require_relative 'wgit/utils'
|
7
7
|
require_relative 'wgit/url'
|
8
8
|
require_relative 'wgit/document'
|
9
|
-
require_relative 'wgit/
|
9
|
+
require_relative 'wgit/document_extractors'
|
10
10
|
require_relative 'wgit/crawler'
|
11
11
|
require_relative 'wgit/database/model'
|
12
12
|
require_relative 'wgit/database/database'
|
13
13
|
require_relative 'wgit/indexer'
|
14
|
+
require_relative 'wgit/dsl'
|
15
|
+
require_relative 'wgit/base'
|
14
16
|
# require_relative 'wgit/core_ext' - Must be explicitly required.
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wgit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.10.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-11-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '1.3'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: ferrum
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0.8'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0.8'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: byebug
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -184,14 +198,10 @@ dependencies:
|
|
184
198
|
- - "<"
|
185
199
|
- !ruby/object:Gem::Version
|
186
200
|
version: '1.0'
|
187
|
-
description: '
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
WWW search engine. The Wgit API is easily extended allowing you to pull out the
|
192
|
-
parts of a webpage that are important to you, the code snippets or tables for example.
|
193
|
-
As Wgit is a library, it supports many different use cases including data mining,
|
194
|
-
analytics, web indexing and URL parsing to name a few.
|
201
|
+
description: 'Wgit was primarily designed to crawl static HTML websites to index and
|
202
|
+
search their content - providing the basis of any search engine; but Wgit is suitable
|
203
|
+
for many application domains including: URL parsing, data mining and statistical
|
204
|
+
analysis.
|
195
205
|
|
196
206
|
'
|
197
207
|
email: michael.telford@live.com
|
@@ -202,12 +212,14 @@ extra_rdoc_files: []
|
|
202
212
|
files:
|
203
213
|
- "./lib/wgit.rb"
|
204
214
|
- "./lib/wgit/assertable.rb"
|
215
|
+
- "./lib/wgit/base.rb"
|
205
216
|
- "./lib/wgit/core_ext.rb"
|
206
217
|
- "./lib/wgit/crawler.rb"
|
207
218
|
- "./lib/wgit/database/database.rb"
|
208
219
|
- "./lib/wgit/database/model.rb"
|
209
220
|
- "./lib/wgit/document.rb"
|
210
|
-
- "./lib/wgit/
|
221
|
+
- "./lib/wgit/document_extractors.rb"
|
222
|
+
- "./lib/wgit/dsl.rb"
|
211
223
|
- "./lib/wgit/indexer.rb"
|
212
224
|
- "./lib/wgit/logger.rb"
|
213
225
|
- "./lib/wgit/response.rb"
|
@@ -229,7 +241,7 @@ metadata:
|
|
229
241
|
source_code_uri: https://github.com/michaeltelford/wgit
|
230
242
|
changelog_uri: https://github.com/michaeltelford/wgit/blob/master/CHANGELOG.md
|
231
243
|
bug_tracker_uri: https://github.com/michaeltelford/wgit/issues
|
232
|
-
documentation_uri: https://www.rubydoc.info/
|
244
|
+
documentation_uri: https://www.rubydoc.info/gems/wgit
|
233
245
|
allowed_push_host: https://rubygems.org
|
234
246
|
post_install_message: Added the 'wgit' executable to $PATH
|
235
247
|
rdoc_options: []
|
@@ -237,18 +249,21 @@ require_paths:
|
|
237
249
|
- lib
|
238
250
|
required_ruby_version: !ruby/object:Gem::Requirement
|
239
251
|
requirements:
|
240
|
-
- - "
|
252
|
+
- - ">="
|
253
|
+
- !ruby/object:Gem::Version
|
254
|
+
version: '2.6'
|
255
|
+
- - "<"
|
241
256
|
- !ruby/object:Gem::Version
|
242
|
-
version: '
|
257
|
+
version: '4'
|
243
258
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
244
259
|
requirements:
|
245
260
|
- - ">="
|
246
261
|
- !ruby/object:Gem::Version
|
247
262
|
version: '0'
|
248
263
|
requirements: []
|
249
|
-
rubygems_version: 3.
|
250
|
-
signing_key:
|
264
|
+
rubygems_version: 3.2.22
|
265
|
+
signing_key:
|
251
266
|
specification_version: 4
|
252
|
-
summary: Wgit is a
|
253
|
-
|
267
|
+
summary: Wgit is a HTML web crawler, written in Ruby, that allows you to programmatically
|
268
|
+
extract the data you want from the web.
|
254
269
|
test_files: []
|